Uploading data
Create a new dataset
library(redivis)
# Could also create a dataset under an organization:
# dataset <- redivis$organization("Demo organization")$dataset("some dataset")
dataset <- redivis$user("your-username")$dataset("some dataset")
# public_access_level can be one of ('none', 'overview', 'metadata', 'sample', 'data')
dataset$create(public_access_level="overview")
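If you rerun this script, create() will fail because the dataset already exists. A guard like the following can help; this is a minimal sketch, assuming the exists() and get() methods on dataset references:
if (dataset$exists()) { # exists()/get() assumed available on dataset references
  dataset <- dataset$get()
} else {
  dataset$create(public_access_level="overview")
}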
Create a table and upload data
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
# Create a table on the dataset. Datasets may have multiple tables
table <- dataset$table("Table name")$create(description="Some description")
# Upload a file to the table.
# You can create multiple uploads per table, in which case they'll be appended together.
upload <- table$upload("data.csv")
upload$create(
  content="./data.csv", # Path to a file, raw vector, or connection
  type="delimited",
  remove_on_fail=TRUE,  # Remove the upload if a failure occurs
  wait_for_finish=TRUE, # Wait for the upload to finish processing
  raise_on_fail=TRUE    # Raise an error on failure
)
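Once wait_for_finish has returned, you can sanity-check the ingest by reading the table back into R. A sketch, assuming the to_tibble() method on table references and that dataset editors can read the unreleased next version:
df <- table$to_tibble() # to_tibble() assumed; reads the table into a tibble
print(nrow(df))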
Upload non-tabular (unstructured) files
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
# Non-tabular files must be uploaded to file index tables
table <- dataset$table("my_files")$create(is_file_index=TRUE)
# Upload all files in a directory
table$add_files(directory="/path/to/directory/")
# Upload specific files
table$add_files(files=list(
  list(path="/path/to/file.png"),                              # file name will be "file.png"
  list(path="/path/to/other/file.png", name="other_file.png"), # file name will be "other_file.png"
  list(data="Hello world", name="hello_world.txt"),            # data can be a string or raw vector
  list(data=url("http://example.com"), name="example_com.html") # data can be a connection
))
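To retrieve these files later, you can download them from the file index table. A sketch; the download_files() method name and signature here are assumptions, not confirmed by this page:
table$download_files("/path/to/output_directory") # method name and signature are assumptions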
Upload data from an external source
# Assuming we have a reference to the table, obtained as above...
upload <- table$upload("data.csv")
upload$create(
  transfer_specification=list(
    sourceType="gcs", # one of gcs, s3, bigQuery, url, redivis
    sourcePath="my-bucket/path-to-my-file.csv",
    # sourcePath="https://example.com/data-file", (for sourceType == "url")
    # sourcePath="workflow_name.dataset_name.table_name", (for sourceType == "bigQuery")
    # sourcePath="owner_name.dataset_or_workflow_name.table_name", (for sourceType == "redivis")
    identity="my_email@example.com" # The email associated with the data source
  )
)
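For example, a transfer from a public URL follows the same shape, swapping in the url variants shown in the comments above. A sketch; the upload name is arbitrary, and the identity field is assumed to be unnecessary for unauthenticated URLs:
upload <- table$upload("data_from_url.csv") # arbitrary upload name
upload$create(
  transfer_specification=list(
    sourceType="url",
    sourcePath="https://example.com/data-file" # identity omitted; assumed unnecessary for public URLs
  )
)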
Stream data to an upload
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
table <- dataset$table("table_name")
# The schema is optional if update_schema is set to TRUE on the insert_rows request
schema <- list(
  list(name="var1", type="string"),
  list(name="var2", type="integer"),
  list(name="var3", type="dateTime")
)
# Construct a data.frame to send (or alternatively, a stringified JSON array of objects)
var1 <- c("hello", "world")
var2 <- c(1, 2)
var3 <- c(NA, "2020-01-01T00:00:00.123") # Use NA for missing values; NULL would be dropped from the vector
rows <- data.frame(var1, var2, var3)
# Reference each upload with its name, which must be unique amongst other uploads
# for the current version of this table.
upload <- table$upload(name="some_streamed_data")
# Only call create if the upload doesn't already exist
upload$create(
  type="stream",
  # schema is optional if update_schema is set to TRUE on insert_rows
  schema=schema,
  # If TRUE, will only create the upload if an upload with this name doesn't already exist.
  # Otherwise, a counter will be appended to the name to preserve name uniqueness.
  if_not_exists=FALSE,
  # If skip_bad_records is TRUE, records that are incompatible with the existing schema are ignored.
  # This has no effect when update_schema is set to TRUE on the insert_rows request.
  skip_bad_records=FALSE # Optional, default is FALSE
)
insert_response <- upload$insert_rows(
  rows,
  # If update_schema is TRUE, variables can be added by subsequent streams,
  # and variable types will be relaxed if new values are incompatible with the previous type.
  # If FALSE, an error will be thrown if a row would cause a schema update,
  # unless skip_bad_records was set to TRUE on the upload (in which case such rows are ignored)
  update_schema=FALSE
)
# See REST API / uploads / insertRows
print(insert_response)
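For larger data frames, you can call insert_rows() repeatedly on the same upload; each call streams an additional batch of rows. A sketch using only the calls shown above, with an arbitrary batch size:
batch_size <- 500 # arbitrary batch size
for (start in seq(1, nrow(rows), by=batch_size)) {
  end <- min(start + batch_size - 1, nrow(rows))
  upload$insert_rows(rows[start:end, ], update_schema=FALSE) # each call streams another batch
}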
Release a new version
library(redivis)
dataset <- redivis$user("username")$dataset("some dataset", version="next")
dataset$release()
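After release, this version becomes the dataset's current version, and references without a version argument will resolve to it. For example (to_tibble() on table references is assumed here):
df <- redivis$user("username")$dataset("some dataset")$table("table_name")$to_tibble() # to_tibble() assumed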
Create a subsequent version on an existing dataset
library(redivis)
dataset <- redivis$user("your-username")$dataset("some dataset")
# dataset$create_next_version will throw an error if a "next" version already exists,
# unless the ignore_if_exists argument is provided
dataset <- dataset$create_next_version(ignore_if_exists=TRUE)
table <- dataset$table("table name")
# By default, all new data is appended to the previous version of a table.
# If you'd like to replace the previous data, update the upload_merge_strategy.
table$update(upload_merge_strategy="replace")
upload <- table$upload("data.csv")
upload$create(
  content="./data.csv",
  # All additional params are optional; default values are shown here
  type="delimited", # One of stream, delimited, csv, ndjson, avro, parquet, orc, xls, xlsx, dta, sas7bdat, sav
  skip_bad_records=FALSE,
  has_header_row=TRUE,  # Only relevant for csv, xls(x)
  remove_on_fail=TRUE,  # Remove the upload if a failure occurs
  wait_for_finish=TRUE, # Wait for the upload to finish processing
  raise_on_fail=TRUE,   # Raise an error on failure
  # The following are only relevant for delimited files:
  allow_quoted_newlines=FALSE, # Allow newlines within cells. Setting to TRUE will substantially reduce ingest performance.
  quote_character='"', # The character used to escape delimiters within cells. Generally a double quote in compliant CSVs.
  delimiter=NULL # Explicitly set the delimiter; otherwise it will be automatically inferred.
)
# When all uploads have finished, release the next version
dataset$release()