* Build a 150-observation demonstration dataset.
clear
set obs 150
gen sid = .
*Generate dataset with missing id values
*75 observations of school 1 and school 2
* Observations 1-75 belong to school 1 and observations 76-150 to school 2.
* A range-qualified -replace- assigns each half in a single command, so no
* per-observation loop or local macro is required.
replace sid = 1 in 1/75
replace sid = 2 in 76/150
*Create student ID variable
*Only 50 observations for each school have a student ID value
* An ID is a letter prefix, a dash, and the observation number: "K-" for
* observations 1-50 and "J-" for observations 76-125. Every other observation
* keeps an empty id so that it can be filled in later.
gen id = ""
* string(_n) turns the current observation number into text, which gives the
* same value a loop counter would have supplied for that row.
replace id = "K-" + string(_n) in 1/50
replace id = "J-" + string(_n) in 76/125
*Split the ID variable into a numeric and string component
* split is the position of the dash inside id (0 when id has no dash).
generate split = strpos(id, "-")
* Text up to and including the dash, e.g. "K-".
generate sid_string = substr(id, 1, split)
* Text after the dash, converted from string to numeric storage.
generate id_num = substr(id, split + 1, .)
destring id_num, replace
* Order the data by school and then by ID prefix.
sort sid sid_string
*CONTINUE THE ID NUMBERING FOR EACH SCHOOL ID
* Every observation without an ID number receives the next unused number in
* its school, counting upward from that school's current maximum.
*
* Within each sid the data are ordered by id_num, which places missing values
* after all existing numbers. -replace- processes observations in order, so
* by the time a row is evaluated id_num[_n-1] already holds the value that was
* assigned to the row before it; the missing rows therefore receive max+1,
* max+2, ... without computing the maximum explicitly and without re-sorting
* once per filled row.
*
* Under -by-, subscripts are relative to the group, so numbering never
* carries over from one school to the next. A school with no ID numbers at
* all has nothing to continue from and is left missing.
bysort sid (id_num): replace id_num = id_num[_n-1] + 1 if missing(id_num)
*replace the missing ids with the generated ids
* id_num becomes a string variable so it can be concatenated with the prefix.
tostring id_num, replace
* Only empty ids in schools 1 and 2 are rewritten: school 1 takes the "K-"
* prefix and school 2 the "J-" prefix.
replace id = cond(sid == 1, "K-", "J-") + id_num if id == "" & inlist(sid, 1, 2)