*---------------------------------------------------------*
* JJ First created 04 / 02 / 2015 ------------------------*
* JJ Last updated 04 / 02 / 2015 -------------------------*
* --------------------------------------------------------*

* This do-file runs through all the steps outlined in computer workshop 1 
* (Including the answers to all worksheet tasks and questions)

/*
*----------------------------------------------------------------*
*1. CREATE THE WORKSHOP DATA ------------------------------------*
*----------------------------------------------------------------*
* Note: The data I have provided to you as part of computer workshop 1
* is based upon a slightly altered version of the original TALIS 2013
* data for Sweden. The below outlines how these adjustments have been made

	*A. Open up the data
	cd "C:\Users\john\Pictures\STATA_VERSION_9\TALIS\International\20140109_Sendout\International Database\Data\"
	use "BTGSWET2.dta", clear															/* Use Swedish data only */

	*B. Replace teaching hours as missing if implausible value
	replace TT2G16 = . if TT2G16 > 90													/* Set to missing if time spent on teaching > 90 hours */

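	*C. Drop any observation where teacher age is above 75 
	*   (Note: Stata treats missing values as larger than any number, so this
	*   condition also drops teachers whose age is missing)
	drop if TT2G02 > 75																	/* Drop all teachers who are aged over 75 (or whose age is missing) */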

	*D. Replace the self-efficacy variable with missing if > 20 (implausible value)
	replace TSELEFFS = . if TSELEFFS > 20

	*E. Create new variable for PT status
	gen PT = .																		/* Create new variable. Set to missing by default */
	replace PT = 0 if TT2G03 == 1													/* Equal to 0 if work FT */
	replace PT = 1 if TT2G03 == 2													/* Equal to 1 if works 71 - 90% of FT hours */
	replace PT = 1 if TT2G03 == 3													/* Equal to 1 if works 51 - 70% of FT hours */
	replace PT = 1 if TT2G03 == 4													/* Equal to 1 if works 50%< of FT hours */

	label define ///																/* Create variable label */
		PT ///																		/* For the PT variable */
		0 "Works full-time" ///														/* 0 = Full-time work */
		1 "Works part-time"															/* 1 = Part-time work */
		
	label value ///																	/*	Assign the label to the variable */
		PT PT
	
	*F. Replace satisfied with profession scale with missing values if > 90
	replace TJSPROS = . if TJSPROS > 90	

	*G. Keep only those variables that are needed for the CW
	keep ///
		PT TT2G02 TCHWGT TSELEFFS TRWGT* TT2G16 TT2G46J ///
		TT2G03 TT2G02 TT2G01 TJSPROS IDSCHOOL

	*H. Save the final recoded computer workshop data
	save "CW_1" , replace															/* Save dataset as CW_1 */
*/

*----------------------------------------------------------------*
*2. SECTION ON TEACHER WEIGHTS ----------------------------------*
*----------------------------------------------------------------*
* Compares estimates of teacher age (TT2G02) and part-time status (PT)
* obtained with and without the final teacher weight (TCHWGT).

	*A. Load the CW_1 workshop dataset, discarding any data currently in memory
	use "CW_1", clear

	*B. Mean teacher age: first unweighted, then weighted by TCHWGT
	*   (once declared as an iweight, once as a pweight)
	mean TT2G02
	mean TT2G02 [iw = TCHWGT]
	mean TT2G02 [pw = TCHWGT]

	*C. Self-efficacy quartiles (CORRECT WAY): cut-points computed using the teacher weight
	xtile Efficacy_Quart = TSELEFFS [pw = TCHWGT], nquantiles(4)

	* Weighted distribution of teachers across the four quartiles
	tabulate Efficacy_Quart [aw = TCHWGT]

	* Weighted mean teacher age in the bottom (1) and top (4) efficacy quartiles
	foreach quart in 1 4 {
		mean TT2G02 [pw = TCHWGT] if Efficacy_Quart == `quart'
	}

	*D. Self-efficacy quartiles (INCORRECT WAY): cut-points computed with NO weight
	xtile Efficacy_Quart_ALT = TSELEFFS, nquantiles(4)

	* Weighted distribution of teachers across the unweighted quartiles
	tabulate Efficacy_Quart_ALT [aw = TCHWGT]

	* Weighted mean teacher age in the bottom (1) and top (4) unweighted quartiles
	foreach quart in 1 4 {
		mean TT2G02 [pw = TCHWGT] if Efficacy_Quart_ALT == `quart'
	}

	*E. Teacher age quartiles, applying the final teacher weight
	xtile Age_Quart = TT2G02 [pw = TCHWGT], nquantiles(4)

	*F. Proportion of teachers working part-time within each age quartile
	*   (1 = youngest quartile, 4 = oldest quartile)
	forvalues quart = 1/4 {
		mean PT [pw = TCHWGT] if Age_Quart == `quart'
	}

********************************************************************************
* ------------------------------------------------------------ *
* 3. MANUAL APPLICATION OF BRR WEIGHTS ----------------------- *	
* ------------------------------------------------------------ *
* Computes, step by step, the BRR standard error and 95% confidence interval
* for mean teacher age (TT2G02):
*   - point estimate using the final teacher weight (TCHWGT)
*   - one estimate per replicate weight (TRWGT1 ... TRWGT100)
*   - sum of squared deviations of the replicate estimates from the point estimate
*   - multiplication by the Fay adjustment factor, then square root
* Every quantity is stored as a constant variable (same value in each row) so
* that it can be inspected with -browse-. Variables are stored as double so
* that no precision is lost before the differences are squared.

	*Step 1: Calculate the point estimate 
	mean TT2G02 [iweight =  TCHWGT]													/* Average of teacher age applying the TEACHER weight */
	gen double Theta_Star = _b[TT2G02]												/* Create theta* variable (the point estimate) */
	browse Theta_Star																/* Look at the created data */

	*Step 2: Calculate each replicate estimate (EXAMPLE: first replicate only)
	mean TT2G02 [iweight =  TRWGT1]													/* Average of teacher age applying the FIRST REPLICATE weight */
	gen double Theta_1 = _b[TT2G02]													/* Create theta1 (first replicate estimate) */
	
	*Step 3: Create 100 replicate estimates
	drop Theta_1																	/* Drop the example above; the loop re-creates it */

	forvalues rep = 1(1)100 {														/* Looping over the 100 replicate weights */
		mean TT2G02 [iweight =  TRWGT`rep']											/* Calculate average age using each replicate */
		gen double Theta_`rep' = _b[TT2G02]											/* Create estimate of Theta using each replicate */
		}

	browse Theta_* 																	/* Look at the point estimate and the 100 replicates */

	*Step 4: Calculate the squared differences 
	gen double Theta_Diff_Squ_1 = (Theta_1 - Theta_Star) ^2							/* EXAMPLE: For first replicate only */

	drop Theta_Diff_Squ_1															/* Drop the example above; the loop re-creates it */

	forvalues rep = 1(1)100 {														/* Looping over the 100 replicate weights */
		gen double Theta_Diff_Squ_`rep' = (Theta_`rep' - Theta_Star) ^2				/* Squared difference between each replicate and the point estimate */ 
		}																			/* CLOSE LOOP */

	browse Theta_Diff_Squ_*															/* Look at the created variables */

	*Step 5: Calculate the sum of these squared differences 
	egen double SUM_DIFF = rowtotal(Theta_Diff_Squ*)								/* Row-wise total across the 100 squared differences */

	*Step 6: Calculate the adjustment we must multiply the sum-of-squares by 
	* Factor = 1 / (G * (1 - k)^2), with G = 100 replicates and k = 0.5 (Fay's adjustment)
	gen double Adjustment_Factor = ( 1 / (100 * (1 - 0.5)^2) )						/* 0.5 = Fay's adjustment. See lecture notes */

	*Step 7: Calculate the total sampling variance and standard error
	gen double Samp_Var = SUM_DIFF * Adjustment_Factor								/* Sum of squared differences multiplied by the adjustment factor */
	gen double SE = sqrt(Samp_Var)													/* Standard error equal to the square root */
	display SE																		/* Display the standard error (value held in the first observation) */

	*Step 8: Create and display the 95% confidence interval
	* invttail(99, 0.025) is the exact two-sided 5% critical value of the
	* t distribution with 99 degrees of freedom (approximately 1.984)
	gen double LOWER_BOUND = Theta_Star - invttail(99, 0.025) * SE					/* Lower confidence limit */
	gen double UPPER_BOUND = Theta_Star + invttail(99, 0.025) * SE					/* Upper confidence limit */

	display LOWER_BOUND 															/* Display lower confidence limit */	
	display UPPER_BOUND																/* Display upper confidence limit */

********************************************************************************

* ------------------------------------------------------------ *
* 4. SVY APPLICATION OF BRR WEIGHTS -------------------------- *	
* ------------------------------------------------------------ *
* Same BRR variance estimation as section 3, but delegated to Stata's
* svy prefix instead of being computed by hand.

	* Step 1: Declare the complex survey design to Stata
	*   [iweight = TCHWGT]      final teacher weight
	*   brrweight(...)          the 100 BRR replicate weights
	*   vce(brr)                BRR variance estimation
	*   fay(.5)                 Fay's adjustment of 0.5
	*   mse                     MSE adjustment
	svyset [iweight = TCHWGT], brrweight(TRWGT1 - TRWGT100) vce(brr) fay(.5) mse

	* Step 2: Estimate the statistic of interest (mean teacher age, with BRR)
	svy: mean TT2G02

	* Question XX: Proportion teachers male vs female (with BRR)
	svy: proportion TT2G01

	* Question 3a: The TALIS dataset includes the variable TT2G46J. What does this variable capture?
	tabulate TT2G46J

	* Question 3b: Create working hour deciles
	* (ten groups of TT2G16, cut-points computed using the teacher weight)
	xtile HOUR_DECILE = TT2G16 [pweight = TCHWGT], nquantiles(10)

	* Question 3c: What proportion of teachers in the lowest working hours decile strongly agree with this statement?
	* (BRR applied; restricted to the lowest decile)
	svy: proportion TT2G46J if HOUR_DECILE == 1

	* Question 3d. How does this compare to the proportion from the top decile (the 10 percent who work the longest hours)?
	* (BRR applied; restricted to the top decile)
	svy: proportion TT2G46J if HOUR_DECILE == 10

	* Question 4. Question XX. What is the 75th percentile of the teacher age distribution?
	* (The variable name for teacher age is TT2G02). Can you calculate the standard error (applying the BRR weights?)
	* YOU CAN'T ANSWER THIS QUESTION USING SVY COMMAND. GIVES FOLLOWING ERROR MESSAGE
	*svy: qreg TT2G02 , q(0.75)
	*svy: sum TT2G02 , d

* ------------------------------------------------------------ *
* 5. STATA repest command ------------------------------------ *	
* ------------------------------------------------------------ *	
* NOTE(review): repest is not called anywhere earlier in this file; it is
* presumably a user-written command that must be installed beforehand - confirm.

	* Switch the final teacher weight and every replicate weight to lower-case names
	* (presumably the names that repest TALISTCH looks for - confirm in the repest help file)
	rename TCHWGT TRWGT*, lower

	* Get repest estimate of teacher average age
	repest TALISTCH, estimate(means TT2G02)

	* Question 5: Can you now use the repest command to answer question XX above?
	* (i.e. find the 75th percentile of the Swedish teacher age distribution).
	repest TALISTCH, estimate(summarize TT2G02, stats(p75))

	* Question 6: Can you now calculate p5, p10, p25, p50, p75, p90 and p95 for the age distribution of Swedish teachers.
	repest TALISTCH, estimate(summarize TT2G02, stats(p5 p10 p25 p50 p75 p90 p95))

	
* ------------------------------------------------------------------------ *
* 6. Does accounting for complex survey design actually make a difference? 
* ------------------------------------------------------------------------ *	
* The same OLS model is estimated four times. The outcome is the satisfaction
* with profession scale (TJSPROS); the controls are employment status
* (i.TT2G03), age (TT2G02) and gender (i.TT2G01). Only the treatment of the
* survey design changes between the four runs.

	* Treat as SRS (no weights, no clustering)
	regress TJSPROS i.TT2G03 TT2G02 i.TT2G01

	* Apply final weight
	regress TJSPROS i.TT2G03 TT2G02 i.TT2G01 [pweight = tchwgt]

	* Apply final weight and cluster standard errors by school
	regress TJSPROS i.TT2G03 TT2G02 i.TT2G01 [pweight = tchwgt], vce(cluster IDSCHOOL)

	* Question 7: Finally, please complete the final row in the table above by estimating this model
	* applying both the BRR and final teacher weights (you can use any method you like).
	* The survey design is declared again here using the lower-case weight names
	* (tchwgt, trwgt1 - trwgt100): final teacher weight, 100 BRR replicate weights,
	* BRR variance estimation, Fay's adjustment of 0.5 and the MSE adjustment.
	svyset [iweight = tchwgt], brrweight(trwgt1 - trwgt100) vce(brr) fay(.5) mse

	* Same OLS model, now estimated with BRR standard errors
	svy: regress TJSPROS i.TT2G03 TT2G02 i.TT2G01
	