Swap rows by columns with duplicate values

2

I am working with the following data:

data <- data.frame(id=c(1811,1811,1522,7121),
                   pregunta=c("17", "17", "20", "21"),
                   valor=c("13","2","12","15"))

    id pregunta valor
1811       17    13
1811       17     2
1522       20    12
7121       21    15

And I would like to make a transformation to obtain:

     id p17.1 p17.2 p20 p21
1: 1811    13     2  NA  NA
2: 1522    NA    NA  12  NA
3: 7121    NA    NA  NA  21

Also, if the example were this:

data <- data.frame(id = c(1811, 1811, 1522,1366, 7121),
                   pregunta = c("17", "17","20","21", "21"),
                   valor = c("13", "2", "12","21", "15"))

The desired solution would be:

id p17.1 p17.2 p20 p21
1811    13     2  NA  NA
1522    NA    NA  12  NA
1366    NA    NA  NA  21
7121    NA    NA  NA  15

I use:

library(tidyverse)
data %>% 
     gather(valor, -'id', -pregunta) %>% 
     spread(pregunta, valor)

But having duplicate values I get the following error:

Error: Duplicate identifiers for rows (1, 2)
In addition: Warning message:
attributes are not identical across measure variables;
they will be dropped

I am interested in creating as many variables as the variable has Question, the problem is that when you repeat values of Question, you give that error.

I have tried other ways but in the end I always get to the problem of duplicate values.

    
asked by Uko 27.03.2018 в 09:38
source

2 answers

2

Undoubtedly the answer of @jbkunst is indicated, but as you did not mention that specifically the solution is by means of dplyr I dare to give you an alternative with the basic functionality of R :

Using reshape() :

data$no <- paste0(data$pregunta, ".", ave(data$id, paste(data$id,data$pregunta), FUN = seq_along))
new.data <- reshape(data[,-2], direction = "wide", idvar=c("id"), timevar = "no")
names(new.data) = gsub("valor.", "p", names(new.data))
new.data

    id p17.1 p17.2 p20.1 p21.1
1 1811    13     2  <NA>  <NA>
3 1522  <NA>  <NA>    12  <NA>
4 7121  <NA>  <NA>  <NA>    15

Or with this example:

data <- data.frame(id = c(1811, 1811, 1522,1366, 7121), 
                   pregunta = c("17", "17","20","21", "21"), 
                   valor = c("13", "2", "12","21", "15")) 

We would get:

    id p17.1 p17.2 p20.1 p21.1
1 1811    13     2  <NA>  <NA>
3 1522  <NA>  <NA>    12  <NA>
4 1366  <NA>  <NA>  <NA>    21
5 7121  <NA>  <NA>  <NA>    15

With: paste0(data$pregunta, ".", ave(data$id, paste(data$id,data$pregunta), FUN = seq_along)) we generate a new column that represents a unique id by id / question / value, numbering each element of the group, so for question 17 we will have a 17_1 and another 17_2 . Then simply with reshape() we transform the data to a wide format. And finally we simply normalize the names of the columns to something more similar to what you were looking for.

    
answered by 27.03.2018 в 16:36
2

It is relatively easy if you first create an identifier within each group "question", how can this be done? Through a combination of group_by , mutate and the auxiliary functions row_number and unite . After this you create a unique identifier what you can use with spread.


library(tidyverse)

data <- data.frame(id = c(1811, 1811, 1522, 7121), pregunta = c("17", "17", 
  "20", "21"), valor = c("13", "2", "12", "15"))

data_con_id <- data %>%
  group_by(pregunta) %>%
  mutate(fila = row_number()) %>% 
  unite("preg.fila", pregunta, fila)
data_con_id
#> # A tibble: 4 x 3
#>      id preg.fila valor
#>   <dbl> <chr>     <fct>
#> 1 1811. 17_1      13   
#> 2 1811. 17_2      2    
#> 3 1522. 20_1      12   
#> 4 7121. 21_1      15

data_con_id %>%
  spread(preg.fila, valor)
#> # A tibble: 3 x 5
#>      id '17_1' '17_2' '20_1' '21_1'
#>   <dbl> <fct>  <fct>  <fct>  <fct> 
#> 1 1522. <NA>   <NA>   12     <NA>  
#> 2 1811. 13     2      <NA>   <NA>  
#> 3 7121. <NA>   <NA>   <NA>   15

Update.

In case you do not want the _1 you could use the auxiliary function n() which counts the number of elements in the group. After those, you can condition with ifelse the format of the name of the comluna in case the number of elements per question is 1 or not.


data_con_id <- data %>%
  group_by(pregunta) %>%
  mutate(
    fila = row_number(),
    cantidad_de_filas = n()
  ) %>% 
  ungroup()
data_con_id
#> # A tibble: 4 x 5
#>      id pregunta valor  fila cantidad_de_filas
#>   <dbl> <fct>    <fct> <int>             <int>
#> 1 1811. 17       13        1                 2
#> 2 1811. 17       2         2                 2
#> 3 1522. 20       12        1                 1
#> 4 7121. 21       15        1                 1


data_con_id <- data_con_id %>%
  mutate(preg_fila = ifelse(
    cantidad_de_filas == 1,
    as.character(pregunta),
    paste(pregunta, fila, sep = "_")
    )
  )
data_con_id
#> # A tibble: 4 x 6
#>      id pregunta valor  fila cantidad_de_filas preg_fila
#>   <dbl> <fct>    <fct> <int>             <int> <chr>    
#> 1 1811. 17       13        1                 2 17_1     
#> 2 1811. 17       2         2                 2 17_2     
#> 3 1522. 20       12        1                 1 20       
#> 4 7121. 21       15        1                 1 21

data_con_id %>%
  select(id, valor, preg_fila) %>%
  spread(preg_fila, valor)
#> # A tibble: 3 x 5
#>      id '17_1' '17_2' '20'  '21' 
#>   <dbl> <fct>  <fct>  <fct> <fct>
#> 1 1522. <NA>   <NA>   12    <NA> 
#> 2 1811. 13     2      <NA>  <NA> 
#> 3 7121. <NA>   <NA>   <NA>  15
    
answered by 27.03.2018 в 15:15