Read and prepare intraday stock data
This notebook reads the raw 1-minute stock price files and converts them into a single return dataset.
Before running the code, download the raw intraday data for all required stocks and place the .txt files in [YOUR FOLDER]. Each file should be named exactly after the corresponding ticker symbol, for example NVDA.txt, MSFT.txt, or SP500.txt.
The script checks whether all required files are available, reads the price data, and keeps only observations that fall inside the configured date range (start_date to end_date) and regular U.S. market hours (09:30 to 16:00, inclusive). It then aligns all series by timestamp, forward-fills missing prices, computes 1-minute log returns, replaces any remaining missing values with zero, and saves the resulting return dataset as full_dataset.csv in the same folder.
R code
Cell 2
Show code
library(readr)
library(dplyr)
library(purrr)
library(zoo)
# ---- Configuration ----

# Folder holding the raw "<TICKER>.txt" files; the output CSV is written here too.
folder <- "[YOUR FOLDER]"
output_file <- file.path(folder, "full_dataset.csv")

# Sample window. `dates` lists every calendar day in the window as a
# "YYYY-MM-DD" string so rows can be filtered by a plain string match.
start_date <- as.Date("2021-05-20")
end_date <- as.Date("2026-03-10")
dates <- as.character(seq(from = start_date, to = end_date, by = "1 day"))

# Ticker groups.
tickers_m7 <- c("NVDA", "MSFT", "AMZN", "META", "GOOG", "TSLA", "AAPL")
tickers_computing <- c("AVGO", "AMD", "MU")
tickers_platform <- c("ORCL", "IBM", "PLTR", "CSCO")
tickers_disruption_adoption <- c("CRM", "NOW", "INTU", "WDAY", "ADBE")
non_ai_tickers <- c(
  "WMT", "COST", "CVX", "ABBV", "PG",
  "MRK", "PM", "MCD", "PEP", "T"
)

# Full universe, in output column order: all groups followed by the index.
selected_tickers <- c(
  tickers_m7,
  tickers_computing,
  tickers_platform,
  tickers_disruption_adoption,
  non_ai_tickers,
  "SP500"
)
# ---- Pre-flight checks ----

# Stop immediately when the data folder itself is absent.
if (!dir.exists(folder)) {
  stop(paste("Folder not found:", folder))
}

# Each ticker must have a "<TICKER>.txt" file in the folder; collect the
# tickers whose file is not there.
expected_paths <- file.path(folder, paste0(selected_tickers, ".txt"))
missing_files <- selected_tickers[!file.exists(expected_paths)]

# Report every missing file at once, one file name per line.
if (length(missing_files) > 0) {
  missing_listing <- paste0(missing_files, ".txt", collapse = "\n")
  stop(
    paste(
      "The following files are missing:",
      missing_listing,
      sep = "\n"
    )
  )
}
# ---- Load per-ticker price series ----

# Read one ticker's raw 1-minute file and return a two-column tibble:
# `datetime` plus the price column renamed to the ticker symbol.
#
# ticker: ticker symbol; the file read is "<folder>/<ticker>.txt".
# folder: directory that holds the raw .txt files.
# dates:  character vector of "YYYY-MM-DD" days to keep.
#
# Rows with a missing timestamp or price, rows outside `dates`, and rows
# outside 09:30:00-16:00:00 (inclusive) are dropped. When a timestamp occurs
# more than once, only its first row is kept.
read_ticker_prices <- function(ticker, folder, dates) {
  # The files are read without a header row; the fifth column is used as the
  # price.
  raw <- read_csv(
    file.path(folder, paste0(ticker, ".txt")),
    col_names = c("datetime", "open", "high", "low", "price", "volume"),
    show_col_types = FALSE
  )

  # Interpret timestamps as New York time; if nothing parses that way, parse
  # as UTC and re-express the result in New York time.
  # NOTE(review): if read_csv has already parsed this column as a date-time,
  # confirm that the resulting time zone is the intended one.
  stamps <- as.POSIXct(raw$datetime, tz = "America/New_York")
  if (all(is.na(stamps))) {
    stamps <- as.POSIXct(raw$datetime, tz = "UTC")
    stamps <- as.POSIXct(
      format(stamps, tz = "America/New_York", usetz = TRUE),
      tz = "America/New_York"
    )
  }

  prices <- raw %>%
    mutate(
      datetime = stamps,
      date = format(datetime, "%Y-%m-%d"),
      time = format(datetime, "%H:%M:%S")
    ) %>%
    filter(!is.na(datetime), !is.na(price)) %>%
    filter(date %in% dates) %>%
    # Zero-padded "HH:MM:SS" strings sort in clock order, so string
    # comparison is enough to select the trading session.
    filter(time >= "09:30:00", time <= "16:00:00") %>%
    select(datetime, price) %>%
    # Deduplicate per ticker BEFORE the series are joined: a timestamp that
    # is repeated in several files would otherwise multiply rows in the
    # join (2 copies in each of k files -> 2^k joined rows).
    distinct(datetime, .keep_all = TRUE)

  names(prices)[2] <- ticker
  prices
}

# Named list with one price tibble per ticker, in `selected_tickers` order.
all_data <- map(
  set_names(selected_tickers),
  read_ticker_prices,
  folder = folder,
  dates = dates
)
# ---- Align, forward-fill, compute returns, save ----

# Align every ticker on the union of all timestamps, ordered by time, with
# at most one row per timestamp.
data <- reduce(all_data, full_join, by = "datetime") %>%
  arrange(datetime) %>%
  distinct(datetime, .keep_all = TRUE)

# Forward-fill gaps with the last observed price. `na.rm = FALSE` keeps the
# leading NAs (timestamps before a ticker's first price) so the row count is
# unchanged. The fill runs over the whole series, so it also carries a price
# across the gap between one day's last row and the next day's first row.
data[selected_tickers] <- lapply(
  data[selected_tickers],
  zoo::na.locf,
  na.rm = FALSE
)
data <- data[, c("datetime", selected_tickers)]

# 1-minute log returns: log(p_t) - log(p_{t-1}). `data` is already sorted by
# datetime. Returns that cannot be computed (the first row, and rows before
# a ticker's first price) are set to 0.
data_returns <- data %>%
  mutate(
    across(all_of(selected_tickers), ~ log(.x) - log(dplyr::lag(.x)))
  ) %>%
  mutate(
    across(all_of(selected_tickers), ~ replace(.x, is.na(.x), 0))
  )

# Save the returns only. The price table `data` must not be written to the
# same path afterwards, or it would overwrite the return dataset.
write_csv(data_returns, output_file)