# Document-wide knitr chunk defaults used while the codebook is rendered.
knitr::opts_chunk$set(
  echo = TRUE,     # include the R source of each chunk in the output
  error = TRUE,    # keep rendering when a chunk errors instead of aborting,
                   # which is usually more convenient for debugging
  message = FALSE, # FALSE: do not show messages raised during generation
  warning = FALSE  # FALSE: do not show warnings raised during generation
)

# Make the black-and-white theme the default for every ggplot2 plot.
ggplot2::theme_set(ggplot2::theme_bw())

# Directory from which the harmo_<year>.csv input files are read
basedir <- "/home/ecco_rais/data/clean/RAIS-homogenized/output/"

# First and last year of the (currently disabled) year loop; adjust as needed
startyear <- 2003
endyear <- 2004

# Packages attached for the codebook run
library(codebook)
library(rio)

# Start the codebook loop
# NOTE(review): the loop over startyear:endyear is commented out and `year` is
# pinned to "2006", which lies outside the configured 2003-2004 range --
# confirm this is intended before re-enabling the loop.

#for ( year in startyear:endyear) {
year <- "2006"
    # Read the harmonized CSV for the selected year.
    # fread() is called through its namespace because data.table is not
    # attached by the library() calls in this script.
    # integer64 = "character": columns that fread would otherwise return as
    # bit64::integer64 (large numeric identifiers) are read as character
    # instead. codebook/skimr/ggplot2 have no integer64 support, which is what
    # produces the "Couldn't find skimmers for class: integer64" warnings and
    # the geom_histogram() errors when such columns are summarised.
    codebook_data <- data.table::fread(
        file.path(basedir, paste0("harmo_", year, ".csv")),
        integer64 = "character"
    )

    # omit the following call if your missing values are already properly
    # labelled
    codebook_data <- detect_missing(codebook_data,
        only_labelled = TRUE, # only labelled values are autodetected as
                              # missing
        negative_values_are_missing = FALSE, # FALSE: negative values are NOT
                                             # treated as missing values
        ninety_nine_problems = TRUE    # 99/999 are missing values, if they
                                       # are more than 5 MAD from the median
    )

    # If you are not using formr, the codebook package needs to guess which items
    # form a scale. The following line finds item aggregates with names like this:
    # scale = scale_1 + scale_2R + scale_3R
    # identifying these aggregates allows the codebook function to
    # automatically compute reliabilities.
    # However, it will not reverse items automatically.
    codebook_data <- detect_scales(codebook_data)
## Warning in detect_scales(codebook_data): cnae items found, but no aggregate
## Warning in detect_scales(codebook_data): active items found, but no aggregate
## Warning in detect_scales(codebook_data): cbo items found, but no aggregate
    # Convert the prepared data to a data.table and render the codebook.
    # as.data.table() is called through its namespace because data.table is
    # not attached by the library() calls in this script.
    codebook_data <- data.table::as.data.table(codebook_data)
    codebook(codebook_data)
## Warning in max(f): no non-missing arguments to max; returning -Inf
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.
## Warning in max(f): no non-missing arguments to max; returning -Inf
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl` provided. Falling
## back to `character`.

Metadata

Description

Dataset name: codebook_data

The dataset has N=50701027 rows and 48 columns. 20369153 rows have no missing values on any column.

Metadata for search engines
  • Date published: 2024-05-30
x
adm_date
separation_cause
cei
cnae20
cnae95
cpf
termination_day
dob
active1231
sex
schooling
hired_hours
firmID
alvara_ind
cei_avail
pat
simples
termination_month
muni
nationality
legal_form
name
ctps
cbo02
cbo94
pis
disabled
race_color
cnpj_root
earn_dec
earn_dec_mw
mean_earn
mean_earn_mw
hired_wage
cnae20sub
establishment_size
tenure
adm_type
firmID_type
wage_type
disability_type
contract_type
last_wage
yr
termination_year
adm_day
adm_month
adm_year

#Variables

adm_date

Distribution

Distribution of values for adm_date

Distribution of values for adm_date

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
adm_date numeric 0 1 1e+06 3102005 3.1e+07 8046676 8413408 ▇▂▂▁▁ NA

separation_cause

Distribution

Distribution of values for separation_cause

Distribution of values for separation_cause

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
separation_cause numeric 0 1 0 0 80 4.544575 8.073679 ▇▁▁▁▁ NA

cei

Distribution

## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing position.
## ℹ Error occurred in the 1st layer.
## Caused by error in `if (...) NULL`:
## ! missing value where TRUE/FALSE needed

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
cei character 0 1 33023 0 1 21 0 NA

cnae20

Distribution

Distribution of values for cnae20

Distribution of values for cnae20

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cnae20 numeric 0 1 1113 49213 99008 54327.63 27617.83 ▃▂▇▂▇ NA

cnae95

Distribution

Distribution of values for cnae95

Distribution of values for cnae95

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cnae95 numeric 0 1 1112 55131 99007 54356.51 24357.29 ▃▂▆▇▂ NA

cpf

Distribution

## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing position.
## ℹ Error occurred in the 1st layer.
## Caused by error in `if (...) NULL`:
## ! missing value where TRUE/FALSE needed

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
cpf character 0 1 41697555 0 1 21 0 NA

termination_day

Distribution

Distribution of values for termination_day

Distribution of values for termination_day

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
termination_day character 0 1 31 0 2 12 0 NA

dob

Distribution

Distribution of values for dob

Distribution of values for dob

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
dob numeric 0 1 1e+06 1.5e+07 5.1e+07 15621671 8784753 ▇▇▇▁▁ NA

active1231

Distribution

Distribution of values for active1231

Distribution of values for active1231

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
active1231 numeric 0 1 0 1 1 0.6933834 0.4610888 ▃▁▁▁▇ NA

sex

Distribution

Distribution of values for sex

Distribution of values for sex

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
sex numeric 0 1 0 0 1 0.3883998 0.4873863 ▇▁▁▁▅ NA

schooling

Distribution

Distribution of values for schooling

Distribution of values for schooling

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
schooling numeric 0 1 1 7 11 5.996219 1.978101 ▂▅▇▃▁ NA

hired_hours

Distribution

Distribution of values for hired_hours

Distribution of values for hired_hours

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
hired_hours numeric 0 1 1 44 44 41.2524 6.372872 ▁▁▁▁▇ NA

firmID

Distribution

## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing stat.
## ℹ Error occurred in the 1st layer.
## Caused by error in `seq_len()`:
## ! argument must be coercible to non-negative integer

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
firmID character 0 1 2805601 0 15 21 0 NA

alvara_ind

Distribution

Distribution of values for alvara_ind

Distribution of values for alvara_ind

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
alvara_ind numeric 0 1 1 2 2 1.998061 0.0439917 ▁▁▁▁▇ NA

cei_avail

Distribution

Distribution of values for cei_avail

Distribution of values for cei_avail

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cei_avail numeric 0 1 0 0 1 0.0168996 0.1288953 ▇▁▁▁▁ NA

pat

Distribution

Distribution of values for pat

Distribution of values for pat

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
pat numeric 0 1 0 0 1 0.2642605 0.4409387 ▇▁▁▁▃ NA

simples

Distribution

Distribution of values for simples

Distribution of values for simples

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
simples numeric 0 1 0 0 1 0.2200727 0.4142954 ▇▁▁▁▂ NA

termination_month

Distribution

Distribution of values for termination_month

Distribution of values for termination_month

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
termination_month numeric 0 1 0 0 12 2.074017 3.680925 ▇▁▁▁▁ NA

muni

Distribution

Distribution of values for muni

Distribution of values for muni

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
muni numeric 0 1 110001 351880 530010 346118.3 84011.67 ▁▂▇▂▁ NA

nationality

Distribution

Distribution of values for nationality

Distribution of values for nationality

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
nationality numeric 0 1 10 10 50 10.03237 0.9783084 ▇▁▁▁▁ NA

name

Distribution

Distribution of values for name

Distribution of values for name

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
name character 0 1 30661092 0 4 30 0 NA

ctps

Distribution

Distribution of values for ctps

Distribution of values for ctps

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
ctps numeric 0 1 0 51962 1e+08 670518.6 3993270 ▇▁▁▁▁ NA

cbo02

Distribution

Distribution of values for cbo02

Distribution of values for cbo02

641422 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cbo02 numeric 641422 0.9873489 111215 514225 992225 529052.6 206124.8 ▃▇▇▆▂ NA

cbo94

Distribution

Distribution of values for cbo94

Distribution of values for cbo94

3791092 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cbo94 numeric 3791092 0.9252265 12110 55120 99190 56609.29 26168.9 ▃▇▆▂▆ NA

pis

Distribution

## Error in `ggplot2::geom_histogram()`:
## ! Problem while computing stat.
## ℹ Error occurred in the 1st layer.
## Caused by error in `seq_len()`:
## ! argument must be coercible to non-negative integer

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
pis character 0 1 42486868 0 13 21 0 NA

disabled

Distribution

Distribution of values for disabled

Distribution of values for disabled

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
disabled numeric 0 1 0 1 1 0.9761869 0.1524667 ▁▁▁▁▇ NA

race_color

Distribution

Distribution of values for race_color

Distribution of values for race_color

8195031 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
race_color numeric 8195031 0.8383656 1 2 9 3.955704 2.768923 ▇▁▁▁▃ NA

cnpj_root

Distribution

Distribution of values for cnpj_root

Distribution of values for cnpj_root

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cnpj_root numeric 0 1 0 8424178 1e+08 26513603 28998582 ▇▂▂▂▁ NA

earn_dec

Distribution

Distribution of values for earn_dec

Distribution of values for earn_dec

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
earn_dec numeric 0 1 0 500 52503 895.4861 1810.114 ▇▁▁▁▁ NA

earn_dec_mw

Distribution

Distribution of values for earn_dec_mw

Distribution of values for earn_dec_mw

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
earn_dec_mw numeric 0 1 0 1.4 150 2.555371 5.170967 ▇▁▁▁▁ NA

mean_earn

Distribution

Distribution of values for mean_earn

Distribution of values for mean_earn

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
mean_earn numeric 0 1 0 616 52500 1082.04 1686.269 ▇▁▁▁▁ NA

mean_earn_mw

Distribution

Distribution of values for mean_earn_mw

Distribution of values for mean_earn_mw

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
mean_earn_mw numeric 0 1 0 1.8 150 3.218073 5.063303 ▇▁▁▁▁ NA

hired_wage

Distribution

Distribution of values for hired_wage

Distribution of values for hired_wage

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
hired_wage numeric 0 1 0.01 480 1e+07 799.4049 8607.02 ▇▁▁▁▁ NA

cnae20sub

Distribution

Distribution of values for cnae20sub

Distribution of values for cnae20sub

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
cnae20sub numeric 0 1 -1 4789099 9900800 5320712 2812967 ▅▂▇▂▇ NA

establishment_size

Distribution

Distribution of values for establishment_size

Distribution of values for establishment_size

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
establishment_size numeric 0 1 0 5 9 5.34562 2.831424 ▃▅▆▅▇ NA

tenure

Distribution

Distribution of values for tenure

Distribution of values for tenure

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
tenure numeric 0 1 0 18 600 49.86951 74.32097 ▇▁▁▁▁ NA

adm_type

Distribution

Distribution of values for adm_type

Distribution of values for adm_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
adm_type numeric 0 1 0 0 8 0.67417 0.9910125 ▇▃▁▁▁ NA

firmID_type

Distribution

Distribution of values for firmID_type

Distribution of values for firmID_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
firmID_type numeric 0 1 0 1 1 0.9568351 0.2032283 ▁▁▁▁▇ NA

wage_type

Distribution

Distribution of values for wage_type

Distribution of values for wage_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
wage_type numeric 0 1 1 1 7 1.460053 1.297719 ▇▁▁▁▁ NA

disability_type

Distribution

Distribution of values for disability_type

Distribution of values for disability_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
disability_type numeric 0 1 -1 -1 6 -0.9260231 0.5075041 ▇▁▁▁▁ NA

contract_type

Distribution

Distribution of values for contract_type

Distribution of values for contract_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
contract_type numeric 0 1 10 10 97 16.70103 13.51025 ▇▂▁▁▁ NA

last_wage

Distribution

Distribution of values for last_wage

Distribution of values for last_wage

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
last_wage numeric 0 1 0 1.8 150 3.506903 6.402573 ▇▁▁▁▁ NA

yr

Distribution

Distribution of values for yr

Distribution of values for yr

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
yr numeric 0 1 2006 2006 2006 2006 0 ▁▁▇▁▁ NA

termination_year

Distribution

Distribution of values for termination_year

Distribution of values for termination_year

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
termination_year numeric 0 1 2006 2006 2006 2006 0 ▁▁▇▁▁ NA

adm_day

Distribution

Distribution of values for adm_day

Distribution of values for adm_day

23617558 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
adm_day numeric 23617558 0.5341799 1 13 31 13.75036 7.790802 ▇▇▇▅▂ NA

adm_month

Distribution

Distribution of values for adm_month

Distribution of values for adm_month

3647199 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
adm_month numeric 3647199 0.9280646 1 6 12 5.938321 3.208278 ▇▅▅▅▅ NA

adm_year

Distribution

Distribution of values for adm_year

Distribution of values for adm_year

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
adm_year numeric 0 1 1925 2005 2006 2002.226 6.128214 ▁▁▁▁▇ NA

Missingness report

Codebook table

JSON-LD metadata

The following JSON-LD can be found by search engines, if you share this codebook publicly on the web.

{
  "name": "codebook_data",
  "datePublished": "2024-05-30",
  "description": "The dataset has N=50701027 rows and 48 columns.\n20369153 rows have no missing values on any column.\n\n\n## Table of variables\nThis table contains variable names, labels, and number of missing values.\nSee the complete codebook for more.\n\n|name               |label | n_missing|\n|:------------------|:-----|---------:|\n|adm_date           |NA    |         0|\n|separation_cause   |NA    |         0|\n|cei                |NA    |         0|\n|cnae20             |NA    |         0|\n|cnae95             |NA    |         0|\n|cpf                |NA    |         0|\n|termination_day    |NA    |         0|\n|dob                |NA    |         0|\n|active1231         |NA    |         0|\n|sex                |NA    |         0|\n|schooling          |NA    |         0|\n|hired_hours        |NA    |         0|\n|firmID             |NA    |         0|\n|alvara_ind         |NA    |         0|\n|cei_avail          |NA    |         0|\n|pat                |NA    |         0|\n|simples            |NA    |         0|\n|termination_month  |NA    |         0|\n|muni               |NA    |         0|\n|nationality        |NA    |         0|\n|legal_form         |NA    |         0|\n|name               |NA    |         0|\n|ctps               |NA    |         0|\n|cbo02              |NA    |    641422|\n|cbo94              |NA    |   3791092|\n|pis                |NA    |         0|\n|disabled           |NA    |         0|\n|race_color         |NA    |   8195031|\n|cnpj_root          |NA    |         0|\n|earn_dec           |NA    |         0|\n|earn_dec_mw        |NA    |         0|\n|mean_earn          |NA    |         0|\n|mean_earn_mw       |NA    |         0|\n|hired_wage         |NA    |         0|\n|cnae20sub          |NA    |         0|\n|establishment_size |NA    |         0|\n|tenure             |NA    |         0|\n|adm_type           |NA    |         0|\n|firmID_type        |NA    |         0|\n|wage_type          |NA    |         0|\n|disability_type 
   |NA    |         0|\n|contract_type      |NA    |         0|\n|last_wage          |NA    |         0|\n|yr                 |NA    |         0|\n|termination_year   |NA    |         0|\n|adm_day            |NA    |  23617558|\n|adm_month          |NA    |   3647199|\n|adm_year           |NA    |         0|\n\n### Note\nThis dataset was automatically described using the [codebook R package](https://rubenarslan.github.io/codebook/) (version 0.9.2).",
  "keywords": ["adm_date", "separation_cause", "cei", "cnae20", "cnae95", "cpf", "termination_day", "dob", "active1231", "sex", "schooling", "hired_hours", "firmID", "alvara_ind", "cei_avail", "pat", "simples", "termination_month", "muni", "nationality", "legal_form", "name", "ctps", "cbo02", "cbo94", "pis", "disabled", "race_color", "cnpj_root", "earn_dec", "earn_dec_mw", "mean_earn", "mean_earn_mw", "hired_wage", "cnae20sub", "establishment_size", "tenure", "adm_type", "firmID_type", "wage_type", "disability_type", "contract_type", "last_wage", "yr", "termination_year", "adm_day", "adm_month", "adm_year"],
  "@context": "http://schema.org/",
  "@type": "Dataset",
  "variableMeasured": [
    {
      "name": "adm_date",
      "@type": "propertyValue"
    },
    {
      "name": "separation_cause",
      "@type": "propertyValue"
    },
    {
      "name": "cei",
      "@type": "propertyValue"
    },
    {
      "name": "cnae20",
      "@type": "propertyValue"
    },
    {
      "name": "cnae95",
      "@type": "propertyValue"
    },
    {
      "name": "cpf",
      "@type": "propertyValue"
    },
    {
      "name": "termination_day",
      "@type": "propertyValue"
    },
    {
      "name": "dob",
      "@type": "propertyValue"
    },
    {
      "name": "active1231",
      "@type": "propertyValue"
    },
    {
      "name": "sex",
      "@type": "propertyValue"
    },
    {
      "name": "schooling",
      "@type": "propertyValue"
    },
    {
      "name": "hired_hours",
      "@type": "propertyValue"
    },
    {
      "name": "firmID",
      "@type": "propertyValue"
    },
    {
      "name": "alvara_ind",
      "@type": "propertyValue"
    },
    {
      "name": "cei_avail",
      "@type": "propertyValue"
    },
    {
      "name": "pat",
      "@type": "propertyValue"
    },
    {
      "name": "simples",
      "@type": "propertyValue"
    },
    {
      "name": "termination_month",
      "@type": "propertyValue"
    },
    {
      "name": "muni",
      "@type": "propertyValue"
    },
    {
      "name": "nationality",
      "@type": "propertyValue"
    },
    {
      "name": "legal_form",
      "@type": "propertyValue"
    },
    {
      "name": "name",
      "@type": "propertyValue"
    },
    {
      "name": "ctps",
      "@type": "propertyValue"
    },
    {
      "name": "cbo02",
      "@type": "propertyValue"
    },
    {
      "name": "cbo94",
      "@type": "propertyValue"
    },
    {
      "name": "pis",
      "@type": "propertyValue"
    },
    {
      "name": "disabled",
      "@type": "propertyValue"
    },
    {
      "name": "race_color",
      "@type": "propertyValue"
    },
    {
      "name": "cnpj_root",
      "@type": "propertyValue"
    },
    {
      "name": "earn_dec",
      "@type": "propertyValue"
    },
    {
      "name": "earn_dec_mw",
      "@type": "propertyValue"
    },
    {
      "name": "mean_earn",
      "@type": "propertyValue"
    },
    {
      "name": "mean_earn_mw",
      "@type": "propertyValue"
    },
    {
      "name": "hired_wage",
      "@type": "propertyValue"
    },
    {
      "name": "cnae20sub",
      "@type": "propertyValue"
    },
    {
      "name": "establishment_size",
      "@type": "propertyValue"
    },
    {
      "name": "tenure",
      "@type": "propertyValue"
    },
    {
      "name": "adm_type",
      "@type": "propertyValue"
    },
    {
      "name": "firmID_type",
      "@type": "propertyValue"
    },
    {
      "name": "wage_type",
      "@type": "propertyValue"
    },
    {
      "name": "disability_type",
      "@type": "propertyValue"
    },
    {
      "name": "contract_type",
      "@type": "propertyValue"
    },
    {
      "name": "last_wage",
      "@type": "propertyValue"
    },
    {
      "name": "yr",
      "@type": "propertyValue"
    },
    {
      "name": "termination_year",
      "@type": "propertyValue"
    },
    {
      "name": "adm_day",
      "@type": "propertyValue"
    },
    {
      "name": "adm_month",
      "@type": "propertyValue"
    },
    {
      "name": "adm_year",
      "@type": "propertyValue"
    }
  ]
}
 # } # end year loop