[英]grouping table by multiple factors and spreading it from long format to wide - the data.table way in R
[英]How to collapse rows in the tidy world or with data.table after spreading data long -> wide
我在呼叫中心工作,這些數據是我們的自動撥號系統的所有相關數據。 從2019年開始,我們公司開展了多項活動,以便為會員提供許多可能的處方,而不是1。
我們現在遇到的是會員X可能有6個處方,我們正試圖聯系他們,並且往往是重復的。 收集所有這些數據的部門會查看IDCARD_MBR_ID和TARGET_DOC_列,以獲取不關注處方葯名稱的唯一值。
我的目標是去除重復的處方葯。 並填補任何騰空的空間
葯物的順序也不總是相同的......這個簡短的樣本可能是這樣的:
H0000000001 (member data columns) Target1, Target2
H0000000001 (same member data) Target3, Target1, Target2
我想看到的是:
H0000000001 (member data columns) Target1, Target2,Target3
我已經通過聚集處理我的數據,刪除重復項,重新編號我的目標1-6(所以我可以創建一個新行,如果找到> 3)並將其傳播回去。 我的問題是我的傳播..我得到的東西看起來像這樣:
structure(list(IDCARD_MBR_ID = c("H0000000001", "H0000000001",
"H0000000001", "H0000000002", "H0000000002", "H0000000002", "H0000000003",
"H0000000003", "H0000000003", "H0000000004", "H0000000004", "H0000000004",
"H0000000005", "H0000000005", "H0000000005", "H0000000006", "H0000000006",
"H0000000006", "H0000000007", "H0000000007", "H0000000007", "H0000000008",
"H0000000008", "H0000000008", "H0000000009", "H0000000009", "H0000000009",
"H0000000010", "H0000000010", "H0000000010", "H0000000011", "H0000000011",
"H0000000011", "H0000000012", "H0000000012", "H0000000012"),
SDR_PERSON_ID = c("ID000000001", "ID000000001", "ID000000001",
"ID000000002", "ID000000002", "ID000000002", "ID000000003",
"ID000000003", "ID000000003", "ID000000004", "ID000000004",
"ID000000004", "ID000000005", "ID000000005", "ID000000005",
"ID000000006", "ID000000006", "ID000000006", "ID000000007",
"ID000000007", "ID000000007", "ID000000008", "ID000000008",
"ID000000008", "ID000000009", "ID000000009", "ID000000009",
"ID000000010", "ID000000010", "ID000000010", "ID000000011",
"ID000000011", "ID000000011", "ID000000012", "ID000000012",
"ID000000012"), FIRST_NAME = c("First1", "First1", "First1",
"First2", "First2", "First2", "First3", "First3", "First3",
"First4", "First4", "First4", "First5", "First5", "First5",
"First6", "First6", "First6", "First7", "First7", "First7",
"First8", "First8", "First8", "First9", "First9", "First9",
"First10", "First10", "First10", "First11", "First11", "First11",
"First12", "First12", "First12"), LAST_NAME = c("Last1",
"Last1", "Last1", "Last2", "Last2", "Last2", "Last3", "Last3",
"Last3", "Last4", "Last4", "Last4", "Last5", "Last5", "Last5",
"Last6", "Last6", "Last6", "Last7", "Last7", "Last7", "Last8",
"Last8", "Last8", "Last9", "Last9", "Last9", "Last10", "Last10",
"Last10", "Last11", "Last11", "Last11", "Last12", "Last12",
"Last12"), GENDER = c("F", "F", "F", "M", "M", "M", "M",
"M", "M", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M",
"M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M", "F",
"F", "F", "F", "F", "F"), DOB = c("1901-01-01", "1901-01-01",
"1901-01-01", "1901-01-02", "1901-01-02", "1901-01-02", "1901-01-03",
"1901-01-03", "1901-01-03", "1901-01-04", "1901-01-04", "1901-01-04",
"1901-01-05", "1901-01-05", "1901-01-05", "1901-01-06", "1901-01-06",
"1901-01-06", "1901-01-07", "1901-01-07", "1901-01-07", "1901-01-08",
"1901-01-08", "1901-01-08", "1901-01-09", "1901-01-09", "1901-01-09",
"1901-01-10", "1901-01-10", "1901-01-10", "1901-01-11", "1901-01-11",
"1901-01-11", "1901-01-12", "1901-01-12", "1901-01-12"),
ADDRESS1 = c("123 Testing Way", "123 Testing Way", "123 Testing Way",
"456 Testing Way", "456 Testing Way", "456 Testing Way",
"789 Testing Way", "789 Testing Way", "789 Testing Way",
"1122 Testing Way", "1122 Testing Way", "1122 Testing Way",
"1455 Testing Way", "1455 Testing Way", "1455 Testing Way",
"1788 Testing Way", "1788 Testing Way", "1788 Testing Way",
"2121 Testing Way", "2121 Testing Way", "2121 Testing Way",
"2454 Testing Way", "2454 Testing Way", "2454 Testing Way",
"2787 Testing Way", "2787 Testing Way", "2787 Testing Way",
"3120 Testing Way", "3120 Testing Way", "3120 Testing Way",
"3453 Testing Way", "3453 Testing Way", "3453 Testing Way",
"3786 Testing Way", "3786 Testing Way", "3786 Testing Way"
), ADDRESS2 = c(NA, NA, NA, NA, NA, NA, "APT 302", "APT 302",
"APT 302", "APT 225", "APT 225", "APT 225", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), CITY = c("CLARKSVILLE", "CLARKSVILLE",
"CLARKSVILLE", "MONROE", "MONROE", "MONROE", "LOS ANGELES",
"LOS ANGELES", "LOS ANGELES", "TULSA", "TULSA", "TULSA",
"RED BUD", "RED BUD", "RED BUD", "MISSION VIEJO", "MISSION VIEJO",
"MISSION VIEJO", "MILLCREEK", "MILLCREEK", "MILLCREEK", "COLUMBIA",
"COLUMBIA", "COLUMBIA", "HOUSTON", "HOUSTON", "HOUSTON",
"APOPKA", "APOPKA", "APOPKA", "SEALY", "SEALY", "SEALY",
"SEWARD", "SEWARD", "SEWARD"), ST = c("IN", "IN", "IN", "MI",
"MI", "MI", "CA", "CA", "CA", "OK", "OK", "OK", "IL", "IL",
"IL", "CA", "CA", "CA", "UT", "UT", "UT", "MO", "MO", "MO",
"TX", "TX", "TX", "FL", "FL", "FL", "TX", "TX", "TX", "NE",
"NE", "NE"), ZIP = c(47129L, 47129L, 47129L, 48162L, 48162L,
48162L, 90028L, 90028L, 90028L, 74136L, 74136L, 74136L, 62278L,
62278L, 62278L, 92692L, 92692L, 92692L, 84124L, 84124L, 84124L,
65201L, 65201L, 65201L, 77057L, 77057L, 77057L, 32703L, 32703L,
32703L, 77474L, 77474L, 77474L, 68434L, 68434L, 68434L),
EMAIL_ADDRESS = c("Test-mail@test.com", "Test-mail@test.com",
"Test-mail@test.com", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Test-mail@test.com", "Test-mail@test.com", "Test-mail@test.com",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Test-mail@test.com",
"Test-mail@test.com", "Test-mail@test.com", NA, NA, NA, NA,
NA, NA), PHONE1 = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L), ARG_CLIENT_NUM = c(6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L), FORMULARYID = c(19430L, 19430L, 19431L,
19431L, 19431L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L,
19430L, 19430L, 19430L, 19430L, 19431L, 19431L, 19431L, 19430L,
19430L, 19431L, 19431L, 19431L, 19430L, 19430L, 19430L, 19431L,
19430L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L,
19431L), CAMPAIGN_TYPE = c("PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD"), LIS_INDICATOR = c("N",
"N", "N", "N", "N", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N", "N", "N",
"N", "N", "Y", "Y", "N", "Y", "Y", "Y", "Y", "Y", "N"), MONTH = c("February / DIALER",
"February / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "February / DIALER",
"February / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER"), Calltype1 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype2 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype3 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), TARGET_DOC_1 = c("A000000000025001",
NA, NA, "A000000000026001", NA, NA, "A000000000027001", NA,
NA, "A000000000028001", NA, NA, "A000000000029001", NA, NA,
"A000000000030001", NA, NA, "A000000000031001", NA, NA, "A000000000032001",
NA, NA, "A000000000033001", NA, NA, "A000000000034001", NA,
NA, "A000000000035001", NA, NA, "A000000000036001", NA, NA
), TARGET_DOC_2 = c(NA, "A000000000013001", NA, NA, "A000000000014001",
NA, NA, "A000000000015001", NA, NA, "A000000000016001", NA,
NA, "A000000000017001", NA, NA, "A000000000018001", NA, NA,
"A000000000019001", NA, NA, "A000000000020001", NA, NA, "A000000000021001",
NA, NA, "A000000000022001", NA, NA, "A000000000023001", NA,
NA, "A000000000024001", NA), TARGET_DOC_3 = c(NA, NA, "A000000000001001",
NA, NA, "A000000000002001", NA, NA, "A000000000003001", NA,
NA, "A000000000004001", NA, NA, "A000000000005001", NA, NA,
"A000000000006001", NA, NA, "A000000000007001", NA, NA, "A000000000008001",
NA, NA, "A000000000009001", NA, NA, "A000000000010001", NA,
NA, "A000000000011001", NA, NA, "A000000000012001"), TARGET_DRUG_ID_1 = c(57237014401,
NA, NA, 65862058301, NA, NA, 57237014301, NA, NA, 68180075701,
NA, NA, 65862058401, NA, NA, 55111033901, NA, NA, 65862058201,
NA, NA, 65862058405, NA, NA, 65862058601, NA, NA, 68180075801,
NA, NA, 65862058601, NA, NA, 68180076001, NA, NA), TARGET_DRUG_ID_2 = c(NA,
57237014401, NA, NA, 65862058301, NA, NA, 57237014301, NA,
NA, 68180075701, NA, NA, 65862058401, NA, NA, 55111033901,
NA, NA, 65862058201, NA, NA, 65862058405, NA, NA, 65862058601,
NA, NA, 68180075801, NA, NA, 65862058601, NA, NA, 68180076001,
NA), TARGET_DRUG_ID_3 = c(NA, NA, 65862058701, NA, NA, 65862058701,
NA, NA, 55111034001, NA, NA, 65862058601, NA, NA, 65862058701,
NA, NA, 55111034001, NA, NA, 65862058401, NA, NA, 65862058405,
NA, NA, 68180047301, NA, NA, 68180075801, NA, NA, 65862058701,
NA, NA, 65862058301), TARGET_NAME_1 = c("Target11", NA, NA,
"Target12", NA, NA, "Target13", NA, NA, "Target14", NA, NA,
"Target15", NA, NA, "Target16", NA, NA, "Target17", NA, NA,
"Target18", NA, NA, "Target19", NA, NA, "Target20", NA, NA,
"Target21", NA, NA, "Target22", NA, NA), TARGET_NAME_2 = c(NA,
"Target21", NA, NA, "Target22", NA, NA, "Target23", NA, NA,
"Target24", NA, NA, "Target25", NA, NA, "Target26", NA, NA,
"Target27", NA, NA, "Target28", NA, NA, "Target29", NA, NA,
"Target30", NA, NA, "Target31", NA, NA, "Target32", NA),
TARGET_NAME_3 = c(NA, NA, "Target31", NA, NA, "Target32",
NA, NA, "Target33", NA, NA, "Target34", NA, NA, "Target35",
NA, NA, "Target36", NA, NA, "Target37", NA, NA, "Target38",
NA, NA, "Target39", NA, NA, "Target40", NA, NA, "Target41",
NA, NA, "Target42")), class = "data.frame", row.names = c(NA,
-36L))}
每組目標都在各自的行上。我無法弄清楚如何折疊它們並刪除NA,這樣我最終得到的結果如下:
structure(list(IDCARD_MBR_ID = c("H0000000001", "H0000000002",
"H0000000003", "H0000000004", "H0000000005", "H0000000006", "H0000000007",
"H0000000008", "H0000000009", "H0000000010", "H0000000011", "H0000000012"
), SDR_PERSON_ID = c("ID000000001", "ID000000002", "ID000000003",
"ID000000004", "ID000000005", "ID000000006", "ID000000007", "ID000000008",
"ID000000009", "ID000000010", "ID000000011", "ID000000012"),
FIRST_NAME = c("First1", "First2", "First3", "First4", "First5",
"First6", "First7", "First8", "First9", "First10", "First11",
"First12"), LAST_NAME = c("Last1", "Last2", "Last3", "Last4",
"Last5", "Last6", "Last7", "Last8", "Last9", "Last10", "Last11",
"Last12"), GENDER = c("F", "M", "M", "F", "F", "F", "M",
"F", "F", "M", "F", "F"), DOB = c("1/1/1901", "1/2/1901",
"1/3/1901", "1/4/1901", "1/5/1901", "1/6/1901", "1/7/1901",
"1/8/1901", "1/9/1901", "1/10/1901", "1/11/1901", "1/12/1901"
), ADDRESS1 = c("123 Testing Way", "456 Testing Way", "789 Testing Way",
"1122 Testing Way", "1455 Testing Way", "1788 Testing Way",
"2121 Testing Way", "2454 Testing Way", "2787 Testing Way",
"3120 Testing Way", "3453 Testing Way", "3786 Testing Way"
), ADDRESS2 = c(NA, NA, "APT 302", "APT 225", NA, NA, NA,
NA, NA, NA, NA, NA), CITY = c("CLARKSVILLE", "MONROE", "LOS ANGELES",
"TULSA", "RED BUD", "MISSION VIEJO", "MILLCREEK", "COLUMBIA",
"HOUSTON", "APOPKA", "SEALY", "SEWARD"), ST = c("IN", "MI",
"CA", "OK", "IL", "CA", "UT", "MO", "TX", "FL", "TX", "NE"
), ZIP = c(47129L, 48162L, 90028L, 74136L, 62278L, 92692L,
84124L, 65201L, 77057L, 32703L, 77474L, 68434L), EMAIL_ADDRESS = c("Test-mail@test.com",
NA, NA, NA, "Test-mail@test.com", NA, NA, NA, NA, "Test-mail@test.com",
NA, NA), PHONE1 = 1:12, ARG_CLIENT_NUM = c(6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), FORMULARYID = c(19430L,
19431L, 19430L, 19430L, 19430L, 19431L, 19430L, 19431L, 19430L,
19430L, 19430L, 19430L), CAMPAIGN_TYPE = c("PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD"), LIS_INDICATOR = c("N",
"N", "Y", "Y", "Y", "N", "Y", "N", "N", "Y", "Y", "Y"), MONTH = c("February / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER"), Calltype1 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype2 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype3 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), TARGET_DOC_1 = c("A000000000025001",
"A000000000026001", "A000000000027001", "A000000000028001",
"A000000000029001", "A000000000030001", "A000000000031001",
"A000000000032001", "A000000000033001", "A000000000034001",
"A000000000035001", "A000000000036001"), TARGET_DOC_2 = c("A000000000013001",
"A000000000014001", "A000000000015001", "A000000000016001",
"A000000000017001", "A000000000018001", "A000000000019001",
"A000000000020001", "A000000000021001", "A000000000022001",
"A000000000023001", "A000000000024001"), TARGET_DOC_3 = c("A000000000001001",
"A000000000002001", "A000000000003001", "A000000000004001",
"A000000000005001", "A000000000006001", "A000000000007001",
"A000000000008001", "A000000000009001", "A000000000010001",
"A000000000011001", "A000000000012001"), TARGET_DRUG_ID_1 = c(57237014401,
65862058301, 57237014301, 68180075701, 65862058401, 55111033901,
65862058201, 65862058405, 65862058601, 68180075801, 65862058601,
68180076001), TARGET_DRUG_ID_2 = c(57237014401, 65862058301,
57237014301, 68180075701, 65862058401, 55111033901, 65862058201,
65862058405, 65862058601, 68180075801, 65862058601, 68180076001
), TARGET_DRUG_ID_3 = c(65862058701, 65862058701, 55111034001,
65862058601, 65862058701, 55111034001, 65862058401, 65862058405,
68180047301, 68180075801, 65862058701, 65862058301), TARGET_NAME_1 = c("Target11",
"Target12", "Target13", "Target14", "Target15", "Target16",
"Target17", "Target18", "Target19", "Target20", "Target21",
"Target22"), TARGET_NAME_2 = c("Target21", "Target22", "Target23",
"Target24", "Target25", "Target26", "Target27", "Target28",
"Target29", "Target30", "Target31", "Target32"), TARGET_NAME_3 = c("Target31",
"Target32", "Target33", "Target34", "Target35", "Target36",
"Target37", "Target38", "Target39", "Target40", "Target41",
"Target42")), class = "data.frame", row.names = c(NA, -12L
))
我一直在尋找類似這樣的問題:折疊重復的行使用R在幾列中折疊具有不同值的重復行
我最大的問題是我的列數要大得多(必要時......所有行都需要保持一致以保持完整性)我很快就迷失了。
編輯1:添加DF的'頭(12)'(注意DF很寬,難以粘貼以保持格式化..這里是受問題影響的列)...另外,不確定為什么在這個測試數據中有些NA是,有些是NA ..在我的真實數據中,它們是真正的缺失值NA。
Calltype3 TARGET_DOC_1 TARGET_DOC_2 TARGET_DOC_3 TARGET_DRUG_ID_1 TARGET_DRUG_ID_2 TARGET_DRUG_ID_3
1 Cost Savings A000000000025001 <NA> <NA> 57237014401 NA NA
2 Cost Savings <NA> A000000000013001 <NA> NA 57237014401 NA
3 Cost Savings <NA> <NA> A000000000001001 NA NA 65862058701
4 Cost Savings A000000000026001 <NA> <NA> 65862058301 NA NA
5 Cost Savings <NA> A000000000014001 <NA> NA 65862058301 NA
6 Cost Savings <NA> <NA> A000000000002001 NA NA 65862058701
7 Cost Savings A000000000027001 <NA> <NA> 57237014301 NA NA
8 Cost Savings <NA> A000000000015001 <NA> NA 57237014301 NA
9 Cost Savings <NA> <NA> A000000000003001 NA NA 55111034001
10 Cost Savings A000000000028001 <NA> <NA> 68180075701 NA NA
11 Cost Savings <NA> A000000000016001 <NA> NA 68180075701 NA
12 Cost Savings <NA> <NA> A000000000004001 NA NA 65862058601
TARGET_NAME_1 TARGET_NAME_2 TARGET_NAME_3
1 Target11 <NA> <NA>
2 <NA> Target21 <NA>
3 <NA> <NA> Target31
4 Target12 <NA> <NA>
5 <NA> Target22 <NA>
6 <NA> <NA> Target32
7 Target13 <NA> <NA>
8 <NA> Target23 <NA>
9 <NA> <NA> Target33
10 Target14 <NA> <NA>
11 <NA> Target24 <NA>
12 <NA> <NA> Target34
編輯4:根據Humpelstielzchen的建議,我跑了這個:我的傳播DF = spread1
spread2 <- spread1 %>%
group_by(IDCARD_MBR_ID) %>%
mutate(MONTH = MONTH[row_number()==1],
FORMULARYID = FORMULARYID[row_number()==1],
LIS_INDICATOR = LIS_INDICATOR[row_number() ==1]) %>%
group_by_at(vars(-starts_with("TARGET"))) %>%
summarise_at(vars(starts_with("TARGET")), list(. = ~.[which(!is.na(.))[1]]))
它似乎工作正常; 我有一個超過100萬起始行的數據框,這有助於我減少到~600k; 我需要一點時間來驗證准確性,但看起來很棒!
嘗試這個:
library(dplyr)
df1 %>%
group_by(IDCARD_MBR_ID) %>%
mutate(MONTH = MONTH[row_number() == 1],
FORMULARYID = FORMULARYID[row_number() == 1],
LIS_INDICATOR = LIS_INDICATOR[row_number() == 1]) %>%
group_by_at(vars(-starts_with("TARGET"))) %>%
summarise_at(vars(starts_with("TARGET")), list(. = ~.[which(!is.na(.))[1]])) -> res
> res[,c("IDCARD_MBR_ID", "TARGET_NAME_1_.", "TARGET_NAME_2_.", "TARGET_NAME_3_." )]
# A tibble: 12 x 4
IDCARD_MBR_ID TARGET_NAME_1_. TARGET_NAME_2_. TARGET_NAME_3_.
<chr> <chr> <chr> <chr>
1 H0000000001 Target11 Target21 Target31
2 H0000000002 Target12 Target22 Target32
3 H0000000003 Target13 Target23 Target33
4 H0000000004 Target14 Target24 Target34
5 H0000000005 Target15 Target25 Target35
6 H0000000006 Target16 Target26 Target36
7 H0000000007 Target17 Target27 Target37
8 H0000000008 Target18 Target28 Target38
9 H0000000009 Target19 Target29 Target39
10 H0000000010 Target20 Target30 Target40
11 H0000000011 Target21 Target31 Target41
12 H0000000012 Target22 Target32 Target42
我必須均衡一些分組變量(MONTH,FORMULARYID,LIS_INDICATOR),因為它們在同一個ID內變化。 我剛為每個ID選擇了第一個,您可以根據自己的喜好進行調整。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.