# Global knitr chunk options applied to every chunk of the codebook report.
knitr::opts_chunk$set(
  echo = TRUE,     # include the R source of each chunk in the output
  error = TRUE,    # do not interrupt codebook generation in case of errors,
                   # usually better for debugging
  message = FALSE, # do not show messages during codebook generation
  warning = FALSE  # do not show warnings during codebook generation
)
# Use ggplot2's black-and-white theme as the default for all plots.
ggplot2::theme_set(ggplot2::theme_bw())
# Build a codebook for one year of the harmonized data.
# Reads <basedir>/harmo_<year>.csv, runs codebook's missing-value and scale
# detection, and renders the codebook for the resulting table.

# set base directory
basedir <- "/home/ecco_rais/data/clean/RAIS-homogenized/output/"
# adjust as necessary
# NOTE: startyear/endyear are only consumed by the year loop below, which is
# currently commented out; the year actually processed is set via `year`.
startyear <- 2003
endyear <- 2004
# libraries
library(codebook)
library(rio)
# Start the codebook loop
#for ( year in startyear:endyear) {
year <- "2004"
# for CSV
# fread() is namespace-qualified because data.table is not attached by the
# library() calls above.
codebook_data <- data.table::fread(
  file.path(basedir, paste0("harmo_", year, ".csv"))
)
# omit the following lines, if your missing values are already properly labelled
codebook_data <- detect_missing(
  codebook_data,
  only_labelled = TRUE, # only labelled values are autodetected as
                        # missing
  negative_values_are_missing = FALSE, # negative values are NOT treated as
                                       # missing values
  ninety_nine_problems = TRUE # 99/999 are missing values, if they
                              # are more than 5 MAD from the median
)
# If you are not using formr, the codebook package needs to guess which items
# form a scale. The following line finds item aggregates with names like this:
# scale = scale_1 + scale_2R + scale_3R
# identifying these aggregates allows the codebook function to
# automatically compute reliabilities.
# However, it will not reverse items automatically.
codebook_data <- detect_scales(codebook_data)
# Output recorded from a previous run of detect_scales():
## Warning in detect_scales(codebook_data): cnae items found, but no aggregate
## Warning in detect_scales(codebook_data): active items found, but no aggregate
## Warning in detect_scales(codebook_data): cbo items found, but no aggregate
# Convert back to a data.table before rendering (namespace-qualified for the
# same reason as fread() above).
codebook_data <- data.table::as.data.table(codebook_data)
codebook(codebook_data)
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning in max(f): no non-missing arguments to max; returning -Inf
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
Dataset name: codebook_data
The dataset has N=57962283 rows and 45 columns. 28844414 rows have no missing values on any column.
|
#Variables
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
adm_date | numeric | 0 | 1 | 1e+06 | 3112004 | 3.1e+07 | 8145601 | 8430949 | ▇▂▂▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
separation_cause | numeric | 0 | 1 | 0 | 0 | 80 | 4.412473 | 8.034775 | ▇▁▁▁▁ | NA |
## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing position.
## ℹ Error occurred in the 1st layer.
## Caused by error in `if (...) NULL`:
## ! missing value where TRUE/FALSE needed
0 missing values.
name | data_type | n_missing | complete_rate | n_unique | empty | min | max | whitespace | label |
---|---|---|---|---|---|---|---|---|---|
cei | character | 0 | 1 | 29092 | 0 | 1 | 21 | 0 | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
cnae95 | numeric | 0 | 1 | 1112 | 55212 | 99007 | 54321.33 | 24546.35 | ▃▂▆▇▂ | NA |
## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing position.
## ℹ Error occurred in the 1st layer.
## Caused by error in `if (...) NULL`:
## ! missing value where TRUE/FALSE needed
0 missing values.
name | data_type | n_missing | complete_rate | n_unique | empty | min | max | whitespace | label |
---|---|---|---|---|---|---|---|---|---|
cpf | character | 0 | 1 | 36481343 | 0 | 1 | 21 | 0 | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
termination_day | numeric | 0 | 1 | 0 | 0 | 30 | 4.542602 | 8.899851 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
dob | numeric | 0 | 1 | 1e+06 | 1.5e+07 | 5.1e+07 | 15611021 | 8783104 | ▇▇▇▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
active1231 | numeric | 0 | 1 | 0 | 1 | 1 | 0.7018487 | 0.4574463 | ▃▁▁▁▇ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
schooling | numeric | 0 | 1 | 1 | 6 | 9 | 5.836569 | 2.029861 | ▂▃▃▇▃ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
hired_hours | numeric | 0 | 1 | 1 | 44 | 44 | 41.36232 | 6.224744 | ▁▁▁▁▇ | NA |
## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing stat.
## ℹ Error occurred in the 1st layer.
## Caused by error in `seq_len()`:
## ! argument must be coercible to non-negative integer
0 missing values.
name | data_type | n_missing | complete_rate | n_unique | empty | min | max | whitespace | label |
---|---|---|---|---|---|---|---|---|---|
firmID | character | 0 | 1 | 2602198 | 0 | 15 | 21 | 0 | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
alvara_ind | numeric | 0 | 1 | 1 | 2 | 2 | 1.997756 | 0.047314 | ▁▁▁▁▇ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
cei_avail | numeric | 0 | 1 | 0 | 0 | 1 | 0.0139683 | 0.1173592 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
pat | numeric | 0 | 1 | 0 | 0 | 1 | 0.2662736 | 0.442009 | ▇▁▁▁▃ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
simples | numeric | 0 | 1 | 0 | 0 | 1 | 0.2134537 | 0.4097453 | ▇▁▁▁▂ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
termination_month | numeric | 0 | 1 | 0 | 0 | 12 | 2.05743 | 3.701357 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
muni | numeric | 0 | 1 | 110001 | 353420 | 530010 | 349017.3 | 73691.98 | ▁▁▇▂▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
nationality | numeric | 0 | 1 | 10 | 10 | 50 | 10.0417 | 1.108595 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
legal_form | numeric | 0 | 1 | 1015 | 2062 | 5002 | 2067.517 | 703.9957 | ▂▇▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | n_unique | empty | min | max | whitespace | label |
---|---|---|---|---|---|---|---|---|---|
name | character | 0 | 1 | 27446271 | 0 | 2 | 30 | 0 | NA |
## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing position.
## ℹ Error occurred in the 1st layer.
## Caused by error in `if (...) NULL`:
## ! missing value where TRUE/FALSE needed
0 missing values.
name | data_type | n_missing | complete_rate | n_unique | empty | min | max | whitespace | label |
---|---|---|---|---|---|---|---|---|---|
ctps | character | 0 | 1 | 28390651 | 0 | 1 | 21 | 0 | NA |
739096 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
cbo02 | numeric | 739096 | 0.9872487 | 111215 | 514225 | 992225 | 530435 | 204784.4 | ▃▇▇▆▂ | NA |
4059792 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
cbo94 | numeric | 4059792 | 0.929958 | 12110 | 55125 | 99190 | 56858.98 | 25926.23 | ▃▇▆▂▆ | NA |
## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing position.
## ℹ Error occurred in the 1st layer.
## Caused by error in `if (...) NULL`:
## ! missing value where TRUE/FALSE needed
0 missing values.
name | data_type | n_missing | complete_rate | n_unique | empty | min | max | whitespace | label |
---|---|---|---|---|---|---|---|---|---|
pis | character | 0 | 1 | 37856735 | 0 | 1 | 21 | 0 | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
disabled | numeric | 0 | 1 | 0 | 1 | 1 | 0.9945891 | 0.0733593 | ▁▁▁▁▇ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
race_color | numeric | 0 | 1 | 1 | 2 | 9 | 4.016953 | 2.883269 | ▇▁▁▁▃ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
cnpj_root | numeric | 0 | 1 | 0 | 1.7e+07 | 1e+08 | 29320251 | 29156106 | ▇▂▃▂▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
earn_dec | numeric | 0 | 1 | 0 | 458 | 39000 | 819.9083 | 1582.134 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
earn_dec_mw | numeric | 0 | 1 | 0 | 1.8 | 150 | 3.146788 | 6.083618 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
mean_earn | numeric | 0 | 1 | 0 | 562 | 39000 | 988.6142 | 1519.336 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
mean_earn_mw | numeric | 0 | 1 | 0 | 2.2 | 150 | 3.896504 | 6.032453 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
hired_wage | numeric | 0 | 1 | 0.01 | 430 | 1e+07 | 731.1121 | 11219.25 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
sex | numeric | 0 | 1 | 0 | 0 | 1 | 0.3809112 | 0.4856108 | ▇▁▁▁▅ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
establishment_size | numeric | 0 | 1 | 0 | 5 | 9 | 5.282386 | 2.817664 | ▃▅▆▆▇ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
tenure | numeric | 0 | 1 | 0 | 20 | 600 | 49.59315 | 71.57092 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
adm_type | numeric | 0 | 1 | 0 | 0 | 5 | 0.6695472 | 0.981976 | ▇▃▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
firmID_type | numeric | 0 | 1 | 0 | 1 | 1 | 0.9538345 | 0.2098434 | ▁▁▁▁▇ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
wage_type | numeric | 0 | 1 | 1 | 1 | 7 | 1.542992 | 1.395044 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
contract_type | numeric | 0 | 1 | 10 | 10 | 97 | 16.46756 | 12.85777 | ▇▂▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
last_wage | numeric | 0 | 1 | 0 | 2.3 | 150 | 4.358719 | 7.768393 | ▇▁▁▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
yr | numeric | 0 | 1 | 2004 | 2004 | 2004 | 2004 | 0 | ▁▁▇▁▁ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
termination_year | numeric | 0 | 1 | 2004 | 2004 | 2004 | 2004 | 0 | ▁▁▇▁▁ | NA |
26630412 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
adm_day | numeric | 26630412 | 0.5405562 | 1 | 13 | 31 | 13.73992 | 7.835947 | ▇▇▇▅▃ | NA |
4160723 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
adm_month | numeric | 4160723 | 0.9282167 | 1 | 6 | 12 | 5.969088 | 3.182506 | ▇▅▅▆▅ | NA |
0 missing values.
name | data_type | n_missing | complete_rate | min | median | max | mean | sd | hist | label |
---|---|---|---|---|---|---|---|---|---|---|
adm_year | numeric | 0 | 1 | 1918 | 2003 | 2004 | 2000.25 | 5.900699 | ▁▁▁▁▇ | NA |
The following JSON-LD can be found by search engines, if you share this codebook publicly on the web.
{
"name": "codebook_data",
"datePublished": "2024-05-29",
"description": "The dataset has N=57962283 rows and 45 columns.\n28844414 rows have no missing values on any column.\n\n\n## Table of variables\nThis table contains variable names, labels, and number of missing values.\nSee the complete codebook for more.\n\n|name |label | n_missing|\n|:------------------|:-----|---------:|\n|adm_date |NA | 0|\n|separation_cause |NA | 0|\n|cei |NA | 0|\n|cnae95 |NA | 0|\n|cpf |NA | 0|\n|termination_day |NA | 0|\n|dob |NA | 0|\n|active1231 |NA | 0|\n|schooling |NA | 0|\n|hired_hours |NA | 0|\n|firmID |NA | 0|\n|alvara_ind |NA | 0|\n|cei_avail |NA | 0|\n|pat |NA | 0|\n|simples |NA | 0|\n|termination_month |NA | 0|\n|muni |NA | 0|\n|nationality |NA | 0|\n|legal_form |NA | 0|\n|name |NA | 0|\n|ctps |NA | 0|\n|cbo02 |NA | 739096|\n|cbo94 |NA | 4059792|\n|pis |NA | 0|\n|disabled |NA | 0|\n|race_color |NA | 0|\n|cnpj_root |NA | 0|\n|earn_dec |NA | 0|\n|earn_dec_mw |NA | 0|\n|mean_earn |NA | 0|\n|mean_earn_mw |NA | 0|\n|hired_wage |NA | 0|\n|sex |NA | 0|\n|establishment_size |NA | 0|\n|tenure |NA | 0|\n|adm_type |NA | 0|\n|firmID_type |NA | 0|\n|wage_type |NA | 0|\n|contract_type |NA | 0|\n|last_wage |NA | 0|\n|yr |NA | 0|\n|termination_year |NA | 0|\n|adm_day |NA | 26630412|\n|adm_month |NA | 4160723|\n|adm_year |NA | 0|\n\n### Note\nThis dataset was automatically described using the [codebook R package](https://rubenarslan.github.io/codebook/) (version 0.9.2).",
"keywords": ["adm_date", "separation_cause", "cei", "cnae95", "cpf", "termination_day", "dob", "active1231", "schooling", "hired_hours", "firmID", "alvara_ind", "cei_avail", "pat", "simples", "termination_month", "muni", "nationality", "legal_form", "name", "ctps", "cbo02", "cbo94", "pis", "disabled", "race_color", "cnpj_root", "earn_dec", "earn_dec_mw", "mean_earn", "mean_earn_mw", "hired_wage", "sex", "establishment_size", "tenure", "adm_type", "firmID_type", "wage_type", "contract_type", "last_wage", "yr", "termination_year", "adm_day", "adm_month", "adm_year"],
"@context": "http://schema.org/",
"@type": "Dataset",
"variableMeasured": [
{
"name": "adm_date",
"@type": "propertyValue"
},
{
"name": "separation_cause",
"@type": "propertyValue"
},
{
"name": "cei",
"@type": "propertyValue"
},
{
"name": "cnae95",
"@type": "propertyValue"
},
{
"name": "cpf",
"@type": "propertyValue"
},
{
"name": "termination_day",
"@type": "propertyValue"
},
{
"name": "dob",
"@type": "propertyValue"
},
{
"name": "active1231",
"@type": "propertyValue"
},
{
"name": "schooling",
"@type": "propertyValue"
},
{
"name": "hired_hours",
"@type": "propertyValue"
},
{
"name": "firmID",
"@type": "propertyValue"
},
{
"name": "alvara_ind",
"@type": "propertyValue"
},
{
"name": "cei_avail",
"@type": "propertyValue"
},
{
"name": "pat",
"@type": "propertyValue"
},
{
"name": "simples",
"@type": "propertyValue"
},
{
"name": "termination_month",
"@type": "propertyValue"
},
{
"name": "muni",
"@type": "propertyValue"
},
{
"name": "nationality",
"@type": "propertyValue"
},
{
"name": "legal_form",
"@type": "propertyValue"
},
{
"name": "name",
"@type": "propertyValue"
},
{
"name": "ctps",
"@type": "propertyValue"
},
{
"name": "cbo02",
"@type": "propertyValue"
},
{
"name": "cbo94",
"@type": "propertyValue"
},
{
"name": "pis",
"@type": "propertyValue"
},
{
"name": "disabled",
"@type": "propertyValue"
},
{
"name": "race_color",
"@type": "propertyValue"
},
{
"name": "cnpj_root",
"@type": "propertyValue"
},
{
"name": "earn_dec",
"@type": "propertyValue"
},
{
"name": "earn_dec_mw",
"@type": "propertyValue"
},
{
"name": "mean_earn",
"@type": "propertyValue"
},
{
"name": "mean_earn_mw",
"@type": "propertyValue"
},
{
"name": "hired_wage",
"@type": "propertyValue"
},
{
"name": "sex",
"@type": "propertyValue"
},
{
"name": "establishment_size",
"@type": "propertyValue"
},
{
"name": "tenure",
"@type": "propertyValue"
},
{
"name": "adm_type",
"@type": "propertyValue"
},
{
"name": "firmID_type",
"@type": "propertyValue"
},
{
"name": "wage_type",
"@type": "propertyValue"
},
{
"name": "contract_type",
"@type": "propertyValue"
},
{
"name": "last_wage",
"@type": "propertyValue"
},
{
"name": "yr",
"@type": "propertyValue"
},
{
"name": "termination_year",
"@type": "propertyValue"
},
{
"name": "adm_day",
"@type": "propertyValue"
},
{
"name": "adm_month",
"@type": "propertyValue"
},
{
"name": "adm_year",
"@type": "propertyValue"
}
]
}
# } # end year loop