Uploading data
Create a new dataset
library(redivis)
# Could also create a dataset under an organization:
# dataset <- redivis$organization("Demo organization")$dataset("some dataset")
dataset <- redivis$user("your-username")$dataset("some dataset")
# public_access_level can be one of ('none', 'overview', 'metadata', 'sample', 'data')
dataset$create(public_access_level="overview")
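If you rerun this script, create() will fail because the dataset already exists. A guard like the following can help; this is a minimal sketch, assuming the exists() and get() methods on dataset references:
if (dataset$exists()) { # exists()/get() assumed available on dataset references
  dataset <- dataset$get()
} else {
  dataset$create(public_access_level="overview")
}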
Create a table and upload data
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
# Create a table on the dataset. Datasets may have multiple tables
table <- dataset$table("Table name")$create(description="Some description")
# Upload a file to the table.
# You can create multiple uploads per table, in which case they'll be appended together.
upload <- table$upload("data.csv")
upload$create(
  content="./data.csv", # Path to a file, raw vector, or connection
  type="delimited",
  remove_on_fail=TRUE,  # Remove the upload if a failure occurs
  wait_for_finish=TRUE, # Wait for the upload to finish processing
  raise_on_fail=TRUE    # Raise an error on failure
)
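Once wait_for_finish has returned, you can sanity-check the ingest by reading the table back into R. A sketch, assuming the to_tibble() method on table references and that dataset editors can read the unreleased next version:
df <- table$to_tibble() # to_tibble() assumed; reads the table into a tibble
print(nrow(df))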
Upload non-tabular (unstructured) files
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
# Non-tabular files must be uploaded to file index tables
table <- dataset$table("my_files")$create(is_file_index=TRUE)
# Upload all files in a directory
table$add_files(directory="/path/to/directory/")
# Upload specific files
table$add_files(files=list(
  list(path="/path/to/file.png"),                              # file name will be "file.png"
  list(path="/path/to/other/file.png", name="other_file.png"), # file name will be "other_file.png"
  list(data="Hello world", name="hello_world.txt"),            # data can be a string or raw vector
  list(data=url("http://example.com"), name="example_com.html") # data can be a connection
))
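To retrieve these files later, you can download them from the file index table. A sketch; the download_files() method name and signature here are assumptions, not confirmed by this page:
table$download_files("/path/to/output_directory") # method name and signature are assumptions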
Upload data from an external source
# Assuming we have a reference to the table, obtained as above...
upload <- table$upload("data.csv")
upload$create(
  transfer_specification=list(
    sourceType="gcs", # one of gcs, s3, bigQuery, url, redivis
    sourcePath="my-bucket/path-to-my-file.csv",
    # sourcePath="https://example.com/data-file", (for sourceType == "url")
    # sourcePath="workflow_name.dataset_name.table_name", (for sourceType == "bigQuery")
    # sourcePath="owner_name.dataset_or_workflow_name.table_name", (for sourceType == "redivis")
    identity="my_email@example.com" # The email associated with the data source
  )
)
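For example, a transfer from a public URL follows the same shape, swapping in the url variants shown in the comments above. A sketch; the upload name is arbitrary, and the identity field is assumed to be unnecessary for unauthenticated URLs:
upload <- table$upload("data_from_url.csv") # arbitrary upload name
upload$create(
  transfer_specification=list(
    sourceType="url",
    sourcePath="https://example.com/data-file" # identity omitted; assumed unnecessary for public URLs
  )
)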
Stream data to an upload
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
table <- dataset$table("table_name")
# The schema is optional if update_schema is set to TRUE on the insert_rows request
schema <- list(
  list(name="var1", type="string"),
  list(name="var2", type="integer"),
  list(name="var3", type="dateTime")
)
# Construct a data.frame to send (or alternatively, a stringified JSON array of objects)
var1 <- c("hello", "world")
var2 <- c(1, 2)
var3 <- c(NA, "2020-01-01T00:00:00.123") # Use NA for missing values; NULL would be dropped from the vector
rows <- data.frame(var1, var2, var3)
# Reference each upload with its name, which must be unique amongst other uploads
# for the current version of this table.
upload <- table$upload(name="some_streamed_data")
# Only call create if the upload doesn't already exist
upload$create(
  type="stream",
  # schema is optional if update_schema is set to TRUE on insert_rows
  schema=schema,
  # If TRUE, will only create the upload if an upload with this name doesn't already exist.
  # Otherwise, a counter will be appended to the name to preserve name uniqueness.
  if_not_exists=FALSE,
  # If skip_bad_records is TRUE, records that are incompatible with the existing schema are ignored.
  # This has no effect when update_schema is set to TRUE on the insert_rows request.
  skip_bad_records=FALSE # Optional, default is FALSE
)
insert_response <- upload$insert_rows(
  rows,
  # If update_schema is TRUE, variables can be added by subsequent streams,
  # and variable types will be relaxed if new values are incompatible with the previous type.
  # If FALSE, an error will be thrown if a row would cause a schema update,
  # unless skip_bad_records was set to TRUE on the upload (in which case such rows are ignored)
  update_schema=FALSE
)
# See REST API / uploads / insertRows
print(insert_response)
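For larger data frames, you can call insert_rows() repeatedly on the same upload; each call streams an additional batch of rows. A sketch using only the calls shown above, with an arbitrary batch size:
batch_size <- 500 # arbitrary batch size
for (start in seq(1, nrow(rows), by=batch_size)) {
  end <- min(start + batch_size - 1, nrow(rows))
  upload$insert_rows(rows[start:end, ], update_schema=FALSE) # each call streams another batch
}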
Release a new version
library(redivis)
dataset <- redivis$user("username")$dataset("some dataset", version="next")
dataset$release()
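After release, this version becomes the dataset's current version, and references without a version argument will resolve to it. For example (to_tibble() on table references is assumed here):
df <- redivis$user("username")$dataset("some dataset")$table("table_name")$to_tibble() # to_tibble() assumed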
Create a subsequent version on an existing dataset
library(redivis)
dataset <- redivis$user("your-username")$dataset("some dataset")
# dataset$create_next_version will throw an error if a "next" version already exists,
# unless the ignore_if_exists argument is provided
dataset <- dataset$create_next_version(ignore_if_exists=TRUE)
table <- dataset$table("table name")
# By default, all new data is appended to the previous version of a table.
# If you'd like to replace the previous data, update the upload_merge_strategy.
table$update(upload_merge_strategy="replace")
upload <- table$upload("data.csv")
upload$create(
  content="./data.csv",
  # All additional params are optional; default values are shown here
  type="delimited", # One of stream, delimited, csv, ndjson, avro, parquet, orc, xls, xlsx, dta, sas7bdat, sav
  skip_bad_records=FALSE,
  has_header_row=TRUE,  # Only relevant for csv, xls(x)
  remove_on_fail=TRUE,  # Remove the upload if a failure occurs
  wait_for_finish=TRUE, # Wait for the upload to finish processing
  raise_on_fail=TRUE,   # Raise an error on failure
  # The following are only relevant for delimited files:
  allow_quoted_newlines=FALSE, # Allow newlines within cells. Setting to TRUE will substantially reduce ingest performance.
  quote_character='"', # The character used to escape delimiters within cells. Generally a double quote in compliant CSVs.
  delimiter=NULL # Explicitly set the delimiter; otherwise it will be automatically inferred.
)
# When all uploads have finished, release the next version
dataset$release()