Skip to content

Commit 5b1eb29

Browse files
mfeurer and ArlindKadra
authored and committed
[WIP] Fix and improve dataset upload (#440)
* Bug fixes when uploading datasets, removed unnecessary variable from test method. * Added create dataset function in datasets/functions. * Refactored OpenMLDataset. * Refactored _api_calls. * Made the necessary changes to the dataset tutorial. Added the tutorial in the unit tests.
1 parent 805059d commit 5b1eb29

File tree

6 files changed

+365
-59
lines changed

6 files changed

+365
-59
lines changed

examples/Dataset_import.ipynb

Lines changed: 156 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,156 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import arff\n",
10+
"import numpy as np\n",
11+
"import openml\n",
12+
"import sklearn.datasets"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": 2,
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"# For this example we will upload to the test server to not\n",
22+
"# pollute the live server with countless copies of the same\n",
23+
"# dataset\n",
24+
"openml.config.server = 'https://test.openml.org/api/v1/xml'"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 3,
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"# Load an example dataset from scikit-learn which we will \n",
34+
"# upload to OpenML.org via the API\n",
35+
"breast_cancer = sklearn.datasets.load_breast_cancer()\n",
36+
"name = 'BreastCancer(scikit-learn)'\n",
37+
"X = breast_cancer.data\n",
38+
"y = breast_cancer.target\n",
39+
"attribute_names = breast_cancer.feature_names\n",
40+
"targets = breast_cancer.target_names\n",
41+
"description = breast_cancer.DESCR"
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": 4,
47+
"metadata": {},
48+
"outputs": [],
49+
"source": [
50+
"# OpenML does not distinguish between the attributes and\n",
51+
"# targets on the data level and stores all data in a \n",
52+
"# single matrix. The target feature is indicated as \n",
53+
"# meta-data of the dataset (and tasks on that data)\n",
54+
"data = np.concatenate((X, y.reshape((-1, 1))), axis=1)\n",
55+
"attribute_names = list(attribute_names)\n",
56+
"attributes = [\n",
57+
" (attribute_name, 'REAL') for attribute_name in attribute_names\n",
58+
"] + [('class', 'REAL')]"
59+
]
60+
},
61+
{
62+
"cell_type": "code",
63+
"execution_count": 5,
64+
"metadata": {},
65+
"outputs": [],
66+
"source": [
67+
"# Create the dataset object. \n",
68+
"# The definition of all fields can be found in the XSD files\n",
69+
"# describing the expected format:\n",
70+
"# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd\n",
71+
"dataset = openml.datasets.functions.create_dataset(\n",
72+
" # The name of the dataset (needs to be unique). \n",
73+
" # Must not be longer than 128 characters and only contain\n",
74+
" # a-z, A-Z, 0-9 and the following special characters: _\\-\\.(),\n",
75+
" name=name,\n",
76+
" # Textual description of the dataset.\n",
77+
" description=description,\n",
78+
" # The person who created the dataset.\n",
79+
" creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian',\n",
80+
" # People who contributed to the current version of the dataset.\n",
81+
" contributor=None,\n",
82+
" # The date the data was originally collected, given by the uploader.\n",
83+
" collection_date='01-11-1995',\n",
84+
" # Language in which the data is represented.\n",
85+
" # Starts with 1 upper case letter, rest lower case, e.g. 'English'.\n",
86+
" language='English',\n",
87+
" # License under which the data is/will be distributed.\n",
88+
" licence='BSD (from scikit-learn)',\n",
89+
" # Name of the target. Can also have multiple values (comma-separated).\n",
90+
" default_target_attribute='class',\n",
91+
" # The attribute that represents the row-id column, if present in the dataset.\n",
92+
" row_id_attribute=None,\n",
93+
" # Attributes that should be excluded in modelling, such as identifiers and indexes.\n",
94+
" ignore_attribute=None,\n",
95+
" # How to cite the paper.\n",
96+
" citation=(\n",
97+
" \"W.N. Street, W.H. Wolberg and O.L. Mangasarian. \"\n",
98+
" \"Nuclear feature extraction for breast tumor diagnosis. \"\n",
99+
" \"IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, \"\n",
100+
" \"volume 1905, pages 861-870, San Jose, CA, 1993.\"\n",
101+
" ),\n",
102+
" # Attributes of the data\n",
103+
" attributes=attributes,\n",
104+
" data=data,\n",
105+
" # Format of the dataset. Only 'arff' for now.\n",
106+
" format='arff',\n",
107+
" # A version label which is provided by the user.\n",
108+
" version_label='test',\n",
109+
" original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)',\n",
110+
" paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1'\n",
111+
")"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": 6,
117+
"metadata": {
118+
"scrolled": false
119+
},
120+
"outputs": [
121+
{
122+
"name": "stdout",
123+
"output_type": "stream",
124+
"text": [
125+
"231\n"
126+
]
127+
}
128+
],
129+
"source": [
130+
"upload_id = dataset.publish()\n",
131+
"print(upload_id)"
132+
]
133+
}
134+
],
135+
"metadata": {
136+
"kernelspec": {
137+
"display_name": "Python OpenMl",
138+
"language": "python",
139+
"name": "openml3.6"
140+
},
141+
"language_info": {
142+
"codemirror_mode": {
143+
"name": "ipython",
144+
"version": 3
145+
},
146+
"file_extension": ".py",
147+
"mimetype": "text/x-python",
148+
"name": "python",
149+
"nbconvert_exporter": "python",
150+
"pygments_lexer": "ipython3",
151+
"version": "3.6.4"
152+
}
153+
},
154+
"nbformat": 4,
155+
"nbformat_minor": 2
156+
}

openml/_api_calls.py

Lines changed: 7 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,8 @@
1111
OpenMLServerNoResult)
1212

1313

14-
def _perform_api_call(call, data=None, file_dictionary=None,
15-
file_elements=None, add_authentication=True):
14+
def _perform_api_call(call, data=None, file_elements=None,
15+
add_authentication=True):
1616
"""
1717
Perform an API call at the OpenML server.
1818
return self._read_url(url, data=data, filePath=filePath,
@@ -24,9 +24,6 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
2424
The API call. For example data/list
2525
data : dict
2626
Dictionary with post-request payload.
27-
file_dictionary : dict
28-
Mapping of {filename: path} of files which should be uploaded to the
29-
server.
3027
file_elements : dict
3128
Mapping of {filename: str} of strings which should be uploaded as
3229
files to the server.
@@ -47,9 +44,8 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
4744

4845
url = url.replace('=', '%3d')
4946

50-
if file_dictionary is not None or file_elements is not None:
51-
return _read_url_files(url, data=data, file_dictionary=file_dictionary,
52-
file_elements=file_elements)
47+
if file_elements is not None:
48+
return _read_url_files(url, data=data, file_elements=file_elements)
5349
return _read_url(url, data)
5450

5551

@@ -65,32 +61,14 @@ def _file_id_to_url(file_id, filename=None):
6561
return url
6662

6763

68-
def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
69-
"""do a post request to url with data, file content of
70-
file_dictionary and sending file_elements as files"""
64+
def _read_url_files(url, data=None, file_elements=None):
65+
"""do a post request to url with data
66+
and sending file_elements as files"""
7167

7268
data = {} if data is None else data
7369
data['api_key'] = config.apikey
7470
if file_elements is None:
7571
file_elements = {}
76-
if file_dictionary is not None:
77-
for key, path in file_dictionary.items():
78-
path = os.path.abspath(path)
79-
if os.path.exists(path):
80-
try:
81-
if key is 'dataset':
82-
# check if arff is valid?
83-
decoder = arff.ArffDecoder()
84-
with io.open(path, encoding='utf8') as fh:
85-
decoder.decode(fh, encode_nominal=True)
86-
except:
87-
raise ValueError("The file you have provided is not a valid arff file")
88-
89-
file_elements[key] = open(path, 'rb')
90-
91-
else:
92-
raise ValueError("File doesn't exist")
93-
9472
# Using requests.post sets header 'Accept-encoding' automatically to
9573
# 'gzip,deflate'
9674
response = requests.post(url, data=data, files=file_elements)

0 commit comments

Comments (0)