简体   繁体   中英

Trying to loop through dataframe and append a value to a list, but for loops aren't working

Here's a brief look at my data

  X      name sex X1880 X1881
1 1      Mary   F  7065  6919
2 2      Anna   F  2604  2698
3 3      Emma   F  2003  2034
4 4 Elizabeth   F  1939  1852
5 5    Minnie   F  1746  1653

Each "X----" represents a year (up to 2010), the column "name" represents a unique name for a child, and so the corresponding number between any name and year is the number of children born in year "X---" with the specified name (for example, there were 7065 Marys born in 1880).

I would like to loop through columns covering the years 1931 to 2010, find the total number of children born in that year, and then find the total number of children born in that year whose name begins with each letter of the alphabet. Finally, I would like to get the percent of children born in each year whose name begins with each letter, and store it to a list so I can plot trend lines for all letters/all years on the same graph.

Here is the code I have

allnames <- read.csv("SSA-longtail-names.csv")
girls <- subset(allnames, allnames$sex=="F")
year_columns <- as.vector(names(girls)[54:134])


percs <- list()

years <- length(year_columns)
letters <- length(LETTERS)

for (i in range(1:years)){
  total = sum(girls[year_columns[i]])
  for (n in range(1:letters)){
    l <- toString(LETTERS[n])
    sub <- girls[(grep(l, girls$name)),year_columns[i]]
    sub_total <- sum(sub[year_columns[i]])
    percent <- (sub_total / total) * 100
    percs <- append(percs, percent)
  }
}

But the for loops only go through 8 iterations, and the list percs (which is supposed to store the calculated percentages) is full of NAs. Can anyone suggest a way to fix these loops, or perhaps an even easier way to accomplish this task?

Here is an approach using dplyr , tidyr , and stringr to make a long data table by pivoting your year columns.

library(dplyr)
library(tidyr)
library(stringr)
data2 <- data %>% 
  pivot_longer(cols = c(-X, -name, -sex), names_to = "year", values_to = "births") %>%
  complete.cases() %>%  # remove NA rows
  mutate(year = as.integer(str_remove(year, "X")), 
         first_letter = str_sub(name, start = 1, end = 1) %>%
  filter(year >= 1931 & year <= 2010)

Now you can do something like:

data3 <- data2 %>%
  group_by(first_letter, year) %>%
  summarize(total = sum(births))

This gives you a data.frame of three columns:

first_letter   year   total
A              1880   17972
A              1881   16426
# etc.

Now you can do some plotting, for example with ggplot2

library(ggplot2)
# this only looks at the English vowels to make a manageable example
ggplot(data = data3 %>% filter(first_letter %in% c("A", "E", "I", "O", "U"), 
       aes(x = year, y = total, color = first_letter)) +
  geom_line()

  

I've split the solution into the three parts you describe. If you are only after the percentages, you can ignore the first part (total) and combine the second and third:

library(dplyr)
library(stringr)
library(tidyr)

data <- tibble(name = c('Mary', 'Anna', 'Emma', 'Elizabeth', 'Minnie'),
               sex = rep('F', 5),
               X1880 = c(7065, 2604, 2003, 1939, 1746),
               X1881 = c(6919, 2698, 2034, 1852, 1653))

total <- data %>%
  summarise(across(X1880:X1881, sum)) %>%
  pivot_longer(everything(), names_to = 'year', values_to = 'total')

total

#   year  total
#   <chr> <dbl>
# 1 X1880 15357
# 2 X1881 15156

totalPerLetter <- data %>%
  mutate(letter = str_extract(name, '^.')) %>%
  select(letter, starts_with('X')) %>%
  pivot_longer(-letter, names_to = 'year', values_to = 'count') %>%
  group_by(letter, year) %>%
  mutate(count = sum(count)) %>%
  distinct()

totalPerLetter

#   letter year  count
#   <chr>  <chr> <dbl>
# 1 M      X1880  8811
# 2 M      X1881  8572
# 3 A      X1880  2604
# 4 A      X1881  2698
# 5 E      X1880  3942
# 6 E      X1881  3886

pctPerLetter <- totalPerLetter %>%
  group_by(year) %>%
  mutate(total = sum(count)) %>%
  ungroup() %>%
  mutate(percent = count/(total/100))

pctPerLetter

#   letter year  count total percent
#   <chr>  <chr> <dbl> <dbl>   <dbl>
# 1 M      X1880  8811 15357    57.4
# 2 M      X1881  8572 15156    56.6
# 3 A      X1880  2604 15357    17.0
# 4 A      X1881  2698 15156    17.8
# 5 E      X1880  3942 15357    25.7
# 6 E      X1881  3886 15156    25.6

As mentioned, consider reshaping data to long format (the better format in data analytics for merging, cleaning, aggregating, modeling, and plotting).

Reshape

girls_long <- reshape(girls, varying = names(girls)[4:ncol(girls)], times = names(girls)[4:ncol(girls)],
                      idvar = c("X", "name", "sex"),
                      v.names = "count", timevar = "year", ids=NULL,
                      new.row.names = 1:1E5, direction = "long")

girls_long$year <- as.integer(gsub("X", "", girls_long$year))
girls_long
#    X      name   sex  year count
# 1  1      Mary FALSE  1880  7065
# 2  2      Anna FALSE  1880  2604
# 3  3      Emma FALSE  1880  2003
# 4  4 Elizabeth FALSE  1880  1939
# 5  5    Minnie FALSE  1880  1746
# 6  1      Mary FALSE  1881  6919
# 7  2      Anna FALSE  1881  2698
# 8  3      Emma FALSE  1881  2034
# 9  4 Elizabeth FALSE  1881  1852
# 10 5    Minnie FALSE  1881  1653

Aggregations

# Total number of children born in that year
total_df <- aggregate(name ~ year, girls_long, FUN=length)
total_df
#   year count
# 1 1880 15357
# 2 1881 15156

# Total number of children born in that year whose name begins with each letter of the alphabet
girls_long$name_letter <- substring(girls_long$name, 1, 1)
girls_agg <- aggregate(cbind(count=name) ~ name_letter + year, girls_long, FUN=length)
girls_agg
#   name_letter year count
# 1           A 1880  2604
# 2           E 1880  3942
# 3           M 1880  8811
# 4           A 1881  2698
# 5           E 1881  3886
# 6           M 1881  8572

# Percent of children born in each year whose name begins with each letter
girls_agg$percent <- with(girls_agg, count / ave(count, year, FUN=sum))
girls_agg
#   name_letter year count   percent
# 1           A 1880  2604 0.1695644
# 2           E 1880  3942 0.2566908
# 3           M 1880  8811 0.5737449
# 4           A 1881  2698 0.1780153
# 5           E 1881  3886 0.2564001
# 6           M 1881  8572 0.5655846

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM