
How to get the n most common items in each group

I'm trying to make a plot that shows, for each chapter of a book, the most common words in that chapter. The problem is that I'm using the top_n function with a value of 10, but I'm not getting exactly 10 words in each facet. I would also like to know what the difference is here between using count and add_count. Here is the plot:

[Plot: faceted bar charts of word counts per chapter, with more than 10 bars in some facets]

And the code:

library(tidytext)
library(tidyverse)
notw_processed %>%
  filter(chapter < 13) %>%
  count(chapter, word) %>%                 # one row per chapter/word pair, with count n
  group_by(chapter) %>%
  top_n(10, n) %>%                         # keep the 10 largest counts per chapter
  ungroup() %>%
  mutate(word = as_factor(word)) %>%
  mutate(word = reorder_within(word, n, chapter)) %>%  # order bars within each facet
  ggplot(aes(x = word, y = n)) + geom_col() + coord_flip() + 
  facet_wrap(~chapter, scales = "free_y") + scale_x_reordered() 

And a sample from the data:

dput(notw_processed[sample(1:50000, size = 200, replace = FALSE),])
structure(list(linenumber = c(1884L, 3131L, 41L, 2756L, 1011L, 
538L, 3312L, 1856L, 2764L, 2691L, 3702L, 505L, 2090L, 2796L, 
1811L, 270L, 228L, 3088L, 3262L, 778L, 1446L, 1696L, 1839L, 1413L, 
3961L, 1375L, 306L, 895L, 1647L, 2037L, 822L, 2412L, 3266L, 1287L, 
3919L, 3900L, 141L, 1628L, 1459L, 465L, 3309L, 193L, 60L, 4040L, 
3276L, 3522L, 682L, 1338L, 394L, 2023L, 2929L, 3239L, 808L, 160L, 
206L, 2173L, 3818L, 203L, 383L, 1443L, 1693L, 645L, 1535L, 1974L, 
1557L, 3931L, 1877L, 1683L, 1154L, 1601L, 3548L, 1959L, 1625L, 
777L, 704L, 3054L, 2152L, 3624L, 2968L, 2035L, 1621L, 2275L, 
3625L, 805L, 2731L, 1334L, 2460L, 2294L, 684L, 896L, 371L, 1837L, 
2009L, 903L, 1020L, 3300L, 1504L, 1495L, 611L, 2208L, 2277L, 
2025L, 1991L, 584L, 1590L, 1468L, 610L, 2683L, 1697L, 156L, 2640L, 
3507L, 1975L, 163L, 2807L, 2285L, 1687L, 219L, 4069L, 3983L, 
1365L, 176L, 653L, 2226L, 4020L, 3841L, 1915L, 1455L, 486L, 3881L, 
2596L, 2252L, 1248L, 3879L, 364L, 2176L, 2304L, 2900L, 75L, 2488L, 
1852L, 3504L, 1547L, 2713L, 1574L, 3275L, 3061L, 3368L, 3628L, 
3883L, 1701L, 3637L, 3781L, 3042L, 836L, 354L, 2934L, 1781L, 
1964L, 113L, 1707L, 2609L, 2066L, 1882L, 3841L, 2362L, 3894L, 
466L, 2296L, 1230L, 2250L, 1816L, 3947L, 1668L, 139L, 1872L, 
3296L, 2878L, 206L, 2336L, 3852L, 730L, 3956L, 2311L, 373L, 17L, 
83L, 626L, 936L, 2165L, 2686L, 4030L, 1582L, 1120L, 1761L, 1002L, 
40L, 734L, 3733L, 3933L), chapter = c(23L, 41L, 1L, 37L, 12L, 
6L, 43L, 23L, 37L, 37L, 49L, 6L, 27L, 38L, 23L, 3L, 2L, 40L, 
43L, 9L, 17L, 22L, 23L, 16L, 52L, 16L, 3L, 11L, 21L, 26L, 10L, 
33L, 43L, 15L, 52L, 52L, 1L, 20L, 18L, 5L, 43L, 2L, 1L, 53L, 
43L, 46L, 8L, 16L, 4L, 26L, 39L, 43L, 9L, 1L, 2L, 29L, 50L, 2L, 
4L, 17L, 22L, 8L, 20L, 26L, 20L, 52L, 23L, 22L, 14L, 20L, 46L, 
26L, 20L, 9L, 8L, 40L, 28L, 46L, 40L, 26L, 20L, 31L, 46L, 9L, 
37L, 16L, 35L, 31L, 8L, 11L, 3L, 23L, 26L, 11L, 12L, 43L, 19L, 
19L, 7L, 30L, 31L, 26L, 26L, 7L, 20L, 18L, 7L, 37L, 22L, 1L, 
36L, 45L, 26L, 1L, 38L, 31L, 22L, 2L, 53L, 52L, 16L, 1L, 8L, 
31L, 53L, 51L, 24L, 18L, 6L, 52L, 36L, 31L, 14L, 52L, 3L, 29L, 
32L, 39L, 1L, 35L, 23L, 45L, 20L, 37L, 20L, 43L, 40L, 43L, 46L, 
52L, 22L, 46L, 50L, 40L, 10L, 3L, 39L, 23L, 26L, 1L, 22L, 36L, 
26L, 23L, 51L, 32L, 52L, 5L, 31L, 14L, 31L, 23L, 52L, 21L, 1L, 
23L, 43L, 38L, 2L, 32L, 51L, 8L, 52L, 32L, 3L, 1L, 1L, 7L, 12L, 
29L, 37L, 53L, 20L, 13L, 22L, 12L, 1L, 8L, 50L, 52L), word = c("choose", 
"remember", "demon", "manet", "question", "remembering", "finally", 
"times", "marks", "false", "approach", "plum", "unable", "head", 
"treated", "kote", "chronicler", "method", "locate", "thousand", 
"blinding", "hat", "world", "cinder’s", "rallying", "crack", 
"building", "expecting", "wrong", "sow", "god", "husband", "fela", 
"counter", "wil", "lump", "stew", "ate", "deep", "forehead", 
"untarnished", "horse", "west", "series", "archives", "thumb", 
"folk", "slight", "don’t", "leaden", "candle’s", "books", "powerful", 
"banished", "dried", "spoken", "you’re", "shape", "limping", 
"earlier", "customers", "eager", "wagon", "looked", "strangely", 
"yesterday", "finally", "frightening", "indignantly", "bit", 
"front", "pints", "squash", "taborlin", "trouble", "whipped", 
"skarpi", "command", "smile", "considered", "lay", "purse", "eyes", 
"symptoms", "tin", "troupers", "luggage", "penny", "bright", 
"bricks", "nodded", "mother", "dead", "imply", "should’ve", "front", 
"broke", "play", "story", "pulled", "found", "lay", "skarpi", 
"knowing", "smelled", "knots", "chronicler", "worth", "shouted", 
"stew", "pennies", "university", "pennies", "fine", "boy", "smells", 
"sound", "chronicler", "crescent", "stay", "proper", "soldiers", 
"tables", "shirt", "hoping", "riot", "boy", "time", "scribe", 
"prove", "sync", "haven’t", "talking", "tired", "smith’s", "half", 
"half", "plainly", "it’s", "called", "knees", "beck", "wouldn’t", 
"tray", "worth", "physically", "moment", "simmon", "simply", 
"meat", "forward", "impressive", "scarred", "ayes", "don’t", 
"street", "friends", "tanee", "friends", "eyes", "looked", "namer", 
"story", "eyes", "mains", "expressions", "shop", "listening", 
"lucky", "words", "half", "wicked", "candle", "fever", "fidget", 
"shook", "mind", "law", "incredibly", "favor", "grate", "read", 
"fierce", "urchin", "they’re", "broke", "chair", "call", "transferred", 
"remembered", "tarbean’s", "heard", "hot", "chronicler", "size", 
"silly", "wary", "mended", "thin", "dal")), row.names = c(NA, 
-200L), class = c("tbl_df", "tbl", "data.frame"))

As pointed out by @StupidWolf, in the case of ties top_n returns all tied rows, so it doesn't necessarily return exactly 10 rows per group.
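
For example, here is a minimal sketch of the tie behaviour, using a small made-up tibble (ties is hypothetical, not taken from the book data):

library(dplyr)
# asking for the top 2 counts, but three words tie at n = 3
ties <- tibble(word = c("a", "b", "c", "d"), n = c(5, 3, 3, 3))
ties %>% top_n(2, n)
# returns all 4 rows: "a" (n = 5) plus every word tied for second place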

notw_processed %>%
  filter(chapter < 13) %>%
  count(chapter, word) %>%
  group_by(chapter) %>%
  top_n(10, n) %>%
  slice(1:10) %>%        # after top_n, keep at most 10 rows per chapter, dropping extra ties
  ungroup() %>%
  mutate(word = as_factor(word)) %>%
  mutate(word = reorder_within(word, n, chapter)) %>%
  ggplot(aes(x = word, y = n)) + geom_col() + coord_flip() + 
  facet_wrap(~chapter, scales = "free") + scale_x_reordered() 

By grouping by chapter and slicing after the top_n call, I can ensure there are exactly 10 values per facet.
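
Two further notes. First, on count versus add_count: count(chapter, word) collapses the data to one row per chapter/word pair and adds the count column n, while add_count(chapter, word) keeps every original row and appends the same n column to each of them. For this plot you want one row (one bar) per distinct word, so count is the right choice. A minimal sketch with a hypothetical df:

library(dplyr)
df <- tibble(chapter = c(1, 1, 1), word = c("wind", "wind", "name"))
df %>% count(chapter, word)      # 2 rows: one per distinct word, with n = 2 and n = 1
df %>% add_count(chapter, word)  # 3 rows: all original rows, each with its n attached

Second, if you have dplyr >= 1.0.0, top_n is superseded by slice_max, which handles the ties directly, so the same grouping step can be written as:

notw_processed %>%
  filter(chapter < 13) %>%
  count(chapter, word) %>%
  group_by(chapter) %>%
  slice_max(n, n = 10, with_ties = FALSE) %>%  # at most 10 rows per chapter, extra ties dropped
  ungroup()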
