Skip to content
38 changes: 37 additions & 1 deletion examples/create_upload_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# * A list
# * A pandas dataframe
# * A sparse matrix
# * A pandas sparse dataframe

############################################################################
# Dataset is a numpy array
Expand Down Expand Up @@ -243,7 +244,7 @@

sparse_data = coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))

column_names = [
Expand Down Expand Up @@ -273,3 +274,38 @@

upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))


############################################################################
# Dataset is a pandas sparse dataframe
# ====================================

# Build a 4x3 sparse matrix in COO format: the first argument is the list of
# stored values, the second is the pair (row_indices, column_indices).
# The explicit 0.0 at (0, 0) keeps row 0 present in the matrix.
# NOTE(review): pd.SparseDataFrame was deprecated in pandas 0.25 and removed
# in pandas 1.0 — this example assumes an older pandas; on modern pandas use
# pd.DataFrame.sparse.from_spmatrix instead. TODO confirm the pinned version.
sparse_data = coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
df = pd.SparseDataFrame(sparse_data, columns=column_names)
# df.info() shows the sparse dtypes and density, confirming the frame is
# actually stored sparsely before upload.
print(df.info())

# Create the OpenML dataset object from the sparse dataframe. With
# attributes='auto', column names and ARFF types are inferred from the
# dataframe itself; 'y' is declared as the prediction target.
xor_dataset = create_dataset(
name="XOR",
description='Dataset representing the XOR operation',
creator=None,
contributor=None,
collection_date=None,
language='English',
licence=None,
default_target_attribute='y',
row_id_attribute=None,
ignore_attribute=None,
citation=None,
attributes='auto',
data=df,
version_label='example',
)

############################################################################

# Publish to the OpenML server configured in openml.config and report the
# URL of the uploaded dataset.
upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
15 changes: 12 additions & 3 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,8 +502,8 @@ def create_dataset(name, description, creator, contributor,
if attributes == 'auto' or isinstance(attributes, dict):
if not hasattr(data, "columns"):
raise ValueError("Automatically inferring the attributes required "
"a pandas DataFrame. A {!r} was given instead."
.format(data))
"a pandas DataFrame or SparseDataFrame. "
"A {!r} was given instead.".format(data))
# infer the type of data for each column of the DataFrame
attributes_ = attributes_arff_from_df(data)
if isinstance(attributes, dict):
Expand All @@ -525,7 +525,16 @@ def create_dataset(name, description, creator, contributor,
.format(row_id_attribute, [attr[0] for attr in attributes_])
)

data = data.values if hasattr(data, "columns") else data
if hasattr(data, "columns"):
if isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
# liac-arff only supports COO matrices with sorted rows
row_idx_sorted = np.argsort(data.row)
data.row = data.row[row_idx_sorted]
data.col = data.col[row_idx_sorted]
data.data = data.data[row_idx_sorted]
else:
data = data.values

if format is not None:
warn("The format parameter will be deprecated in the future,"
Expand Down
51 changes: 51 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ def test_data_status(self):
self.assertEqual(result[did]['status'], 'active')

def test_attributes_arff_from_df(self):
# DataFrame case
df = pd.DataFrame(
[[1, 1.0, 'xxx', 'A', True], [2, 2.0, 'yyy', 'B', False]],
columns=['integer', 'floating', 'string', 'category', 'boolean']
Expand All @@ -422,6 +423,16 @@ def test_attributes_arff_from_df(self):
('string', 'STRING'),
('category', ['A', 'B']),
('boolean', ['True', 'False'])])
# SparseDataFrame case
df = pd.SparseDataFrame([[1, 1.0],
[2, 2.0],
[0, 0]],
columns=['integer', 'floating'],
default_fill_value=0)
df['integer'] = df['integer'].astype(np.int64)
attributes = attributes_arff_from_df(df)
self.assertEqual(attributes, [('integer', 'INTEGER'),
('floating', 'REAL')])

def test_attributes_arff_from_df_mixed_dtype_categories(self):
# liac-arff imposed categorical attributes to be of sting dtype. We
Expand Down Expand Up @@ -769,6 +780,46 @@ def test_create_dataset_pandas(self):
"Uploaded ARFF does not match original one"
)

# Check that SparseDataFrame are supported properly
sparse_data = scipy.sparse.coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
df = pd.SparseDataFrame(sparse_data, columns=column_names)
# meta-information
description = 'Synthetic dataset created from a Pandas SparseDataFrame'
dataset = openml.datasets.functions.create_dataset(
name=name,
description=description,
creator=creator,
contributor=None,
collection_date=collection_date,
language=language,
licence=licence,
default_target_attribute=default_target_attribute,
row_id_attribute=None,
ignore_attribute=None,
citation=citation,
attributes='auto',
data=df,
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
)
upload_did = dataset.publish()
self.assertEqual(
_get_online_dataset_arff(upload_did),
dataset._dataset,
"Uploaded ARFF does not match original one"
)
self.assertEqual(
_get_online_dataset_format(upload_did),
'sparse_arff',
"Wrong format for dataset"
)

# Check that we can overwrite the attributes
data = [['a'], ['b'], ['c'], ['d'], ['e']]
column_names = ['rnd_str']
Expand Down