How to find rows with duplicate values in R
· 7 min read · Updated March 14, 2026 · beginner
r duplicates dplyr data.table data-frame
Finding duplicate rows is a common data cleaning task. This guide shows you multiple approaches using base R, dplyr, and data.table.
Finding Duplicates in a Vector
Base R: duplicated()
The duplicated() function returns a logical vector indicating which elements have been seen before:
x <- c(1, 2, 2, 3, 3, 3, 4, 5, 5)
duplicated(x)
# [1] FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE
To flag every occurrence of a duplicated value (including the first occurrence, which plain duplicated() skips):
x <- c(1, 2, 2, 3, 3, 3, 4, 5, 5)
duplicated(x) | duplicated(x, fromLast = TRUE)
# [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
Which elements are duplicates?
x <- c(1, 2, 2, 3, 3, 3, 4, 5, 5)
# Find positions of duplicated values
which(duplicated(x))
# [1] 3 5 6 9
# Get the actual duplicate values
unique(x[duplicated(x)])
# [1] 2 3 5
Finding Duplicate Rows in a Data Frame
Base R
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
duplicated(df)
# [1] FALSE FALSE TRUE FALSE FALSE TRUE
Get the row numbers of duplicates:
which(duplicated(df))
# [1] 3 6
Extract the duplicate rows:
df[duplicated(df), ]
# id name score
# 3 2 B 90
# 6 1 A 85
Find all duplicates including first occurrence
df[duplicated(df) | duplicated(df, fromLast = TRUE), ]
#   id name score
# 1  1    A    85
# 2  2    B    90
# 3  2    B    90
# 6  1    A    85
dplyr
Use duplicated() or the group_by() approach:
library(dplyr)
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Find duplicate rows
df %>%
group_by(across(everything())) %>%
filter(n() > 1) %>%
ungroup()
# # A tibble: 4 × 3
# id name score
# <dbl> <chr> <dbl>
# 1     1 A        85
# 2     2 B        90
# 3     2 B        90
# 4     1 A        85
Find duplicates in specific columns only
library(dplyr)
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Find duplicates based on id column only
df %>%
group_by(id) %>%
filter(n() > 1) %>%
ungroup()
# # A tibble: 4 × 3
# id name score
# <dbl> <chr> <dbl>
# 1     1 A        85
# 2     2 B        90
# 3     2 B        90
# 4     1 A        85
# Or using the id and name columns
df %>%
group_by(id, name) %>%
filter(n() > 1) %>%
ungroup()
data.table
library(data.table)
dt <- data.table(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Find duplicate rows
duplicated(dt)
# [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Get duplicate rows
dt[duplicated(dt)]
# id name score
# 1: 2 B 90
# 2: 1 A 85
# Using the anyDuplicated() function - returns index of first duplicate
anyDuplicated(dt)
# [1] 3
# Find duplicates by specific columns
dt[duplicated(dt, by = c("id", "name"))]
# id name score
# 1: 2 B 90
# 2: 1 A 85
Counting Duplicates
Count total duplicate rows
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Base R
sum(duplicated(df))
# [1] 2
# dplyr
library(dplyr)
df %>%
summarise(n_dup = sum(duplicated(.)))
#   n_dup
# 1     2
Count duplicates per group
library(dplyr)
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Count how many times each combination appears
df %>%
group_by(id, name) %>%
mutate(count = n()) %>%
filter(count > 1)
# # A tibble: 4 × 4
# # Groups:   id, name [2]
#      id name  score count
#   <dbl> <chr> <dbl> <int>
# 1     1 A        85     2
# 2     2 B        90     2
# 3     2 B        90     2
# 4     1 A        85     2
Count unique vs total rows
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
nrow(df) # Total rows: 6
# [1] 6
nrow(unique(df)) # Unique rows: 4
# [1] 4
nrow(df) - nrow(unique(df)) # Duplicate rows: 2
# [1] 2
Removing Duplicate Rows
Base R
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Remove all duplicates (keeps first occurrence)
unique(df)
# id name score
# 1 1 A 85
# 2 2 B 90
# 4 3 C 78
# 5 4 D 88
# Or use duplicated()
df[!duplicated(df), ]
# id name score
# 1 1 A 85
# 2 2 B 90
# 4 3 C 78
# 5 4 D 88
# Keep last occurrence instead
df[!duplicated(df, fromLast = TRUE), ]
# id name score
# 3 2 B 90
# 4 3 C 78
# 5 4 D 88
# 6 1 A 85
dplyr
library(dplyr)
df <- data.frame(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
# Remove duplicate rows
distinct(df)
#   id name score
# 1  1    A    85
# 2  2    B    90
# 3  3    C    78
# 4  4    D    88
# Remove duplicates based on specific columns
distinct(df, id, .keep_all = TRUE)
#   id name score
# 1  1    A    85
# 2  2    B    90
# 3  3    C    78
# 4  4    D    88
data.table
library(data.table)
dt <- data.table(
id = c(1, 2, 2, 3, 4, 1),
name = c("A", "B", "B", "C", "D", "A"),
score = c(85, 90, 90, 78, 88, 85)
)
unique(dt)
# id name score
# 1: 1 A 85
# 2: 2 B 90
# 3: 3 C 78
# 4: 4 D 88
# Remove duplicates by specific columns
unique(dt, by = c("id", "name"))
# id name score
# 1: 1 A 85
# 2: 2 B 90
# 3: 3 C 78
# 4: 4 D 88
Practical Examples
Find customers with multiple orders
library(dplyr)
orders <- data.frame(
customer_id = c(101, 101, 102, 103, 103, 103),
order_id = c(1, 2, 3, 4, 5, 6),
amount = c(50, 75, 100, 25, 30, 45)
)
# Find customers with more than one order
orders %>%
group_by(customer_id) %>%
filter(n() > 1) %>%
ungroup()
# # A tibble: 5 × 3
# customer_id order_id amount
# <dbl> <dbl> <dbl>
# 1 101 1 50
# 2 101 2 75
# 3 103 4 25
# 4 103 5 30
# 5 103 6 45
Find records modified more than once
library(dplyr)
logs <- data.frame(
record_id = c("A", "A", "B", "C", "C"),
action = c("create", "update", "create", "create", "delete"),
timestamp = c(1, 2, 3, 4, 5)
)
# Find records that were touched multiple times
logs %>%
group_by(record_id) %>%
filter(n() > 1) %>%
ungroup()
Keep the row with maximum value per group
library(dplyr)
df <- data.frame(
id = c(1, 1, 2, 2, 3, 3),
time = c(10, 20, 15, 25, 30, 35),
value = c(100, 150, 200, 180, 90, 120)
)
# Keep row with maximum value per id
df %>%
group_by(id) %>%
slice_max(value, n = 1) %>%
ungroup()
# # A tibble: 3 × 3
# id time value
# <dbl> <dbl> <dbl>
# 1 1 20 150
# 2     2    15   200
# 3 3 35 120
Performance Comparison
For large datasets, data.table is fastest:
library(data.table)
dt <- data.table(
id = rep(1:1e6, each = 3),
value = rnorm(3e6)
)
system.time(dt[!duplicated(dt)])
# user system elapsed
# 0.089 0.010 0.099
Base R unique() is fastest for vectors. Use dplyr::distinct() for readability in pipelines, and data.table::unique() for large data.
See Also
- duplicated() — Find duplicate elements in a vector or rows in a data frame
- unique() — Extract unique elements or rows
- table() — Count frequency of values