Merged
156 changes: 156 additions & 0 deletions examples/Dataset_import.ipynb
@@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import arff\n",
"import numpy as np\n",
"import openml\n",
"import sklearn.datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# For this example we will upload to the test server to not\n",
"# pollute the live server with countless copies of the same\n",
"# dataset\n",
"openml.config.server = 'https://test.openml.org/api/v1/xml'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Load an example dataset from scikit-learn which we will \n",
"# upload to OpenML.org via the API\n",
"breast_cancer = sklearn.datasets.load_breast_cancer()\n",
"name = 'BreastCancer(scikit-learn)'\n",
"X = breast_cancer.data\n",
"y = breast_cancer.target\n",
"attribute_names = breast_cancer.feature_names\n",
"targets = breast_cancer.target_names\n",
"description = breast_cancer.DESCR"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# OpenML does not distinguish between the attributes and\n",
"# targets on the data level and stores all data in a \n",
"# single matrix. The target feature is indicated as \n",
"# meta-data of the dataset (and tasks on that data)\n",
"data = np.concatenate((X, y.reshape((-1, 1))), axis=1)\n",
"attribute_names = list(attribute_names)\n",
"attributes = [\n",
" (attribute_name, 'REAL') for attribute_name in attribute_names\n",
"] + [('class', 'REAL')]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Create the dataset object. \n",
"# The definition of all fields can be found in the XSD files\n",
"# describing the expected format:\n",
"# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd\n",
"dataset = openml.datasets.functions.create_dataset(\n",
" # The name of the dataset (needs to be unique). \n",
" # Must not be longer than 128 characters and only contain\n",
" # a-z, A-Z, 0-9 and the following special characters: _\\-\\.(),\n",
" name=name,\n",
" # Textual description of the dataset.\n",
" description=description,\n",
" # The person who created the dataset.\n",
" creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian',\n",
" # People who contributed to the current version of the dataset.\n",
" contributor=None,\n",
" # The date the data was originally collected, given by the uploader.\n",
" collection_date='01-11-1995',\n",
" # Language in which the data is represented.\n",
" # Starts with 1 upper case letter, rest lower case, e.g. 'English'.\n",
" language='English',\n",
" # License under which the data is/will be distributed.\n",
" licence='BSD (from scikit-learn)',\n",
" # Name of the target. Can also have multiple values (comma-separated).\n",
" default_target_attribute='class',\n",
" # The attribute that represents the row-id column, if present in the dataset.\n",
" row_id_attribute=None,\n",
" # Attributes that should be excluded in modelling, such as identifiers and indexes.\n",
" ignore_attribute=None,\n",
" # How to cite the paper.\n",
" citation=(\n",
" \"W.N. Street, W.H. Wolberg and O.L. Mangasarian. \"\n",
" \"Nuclear feature extraction for breast tumor diagnosis. \"\n",
" \"IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, \"\n",
" \"volume 1905, pages 861-870, San Jose, CA, 1993.\"\n",
" ),\n",
" # Attributes of the data\n",
" attributes=attributes,\n",
" data=data,\n",
" # Format of the dataset. Only 'arff' for now.\n",
" format='arff',\n",
" # A version label which is provided by the user.\n",
" version_label='test',\n",
" original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)',\n",
" paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"231\n"
]
}
],
"source": [
"upload_id = dataset.publish()\n",
"print(upload_id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python OpenMl",
"language": "python",
"name": "openml3.6"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
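The notebook's final cell prints the id assigned by the test server (231 above). Below is a minimal sketch, not part of the committed notebook, of how one might verify the upload by fetching the dataset back. It assumes the same test-server configuration, that the server has finished processing the upload, and that 231 is simply the id printed by the last cell.

import openml

openml.config.server = 'https://test.openml.org/api/v1/xml'

# Fetch the dataset published above; 231 is the id printed by the last cell.
# This only succeeds once the test server has processed the upload.
downloaded = openml.datasets.get_dataset(231)
print(downloaded.name)                       # 'BreastCancer(scikit-learn)'
print(downloaded.default_target_attribute)   # 'class'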
36 changes: 7 additions & 29 deletions openml/_api_calls.py
@@ -11,8 +11,8 @@
OpenMLServerNoResult)


def _perform_api_call(call, data=None, file_dictionary=None,
file_elements=None, add_authentication=True):
def _perform_api_call(call, data=None, file_elements=None,
add_authentication=True):
"""
Perform an API call at the OpenML server.
@@ -24,9 +24,6 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
The API call. For example data/list
data : dict
Dictionary with post-request payload.
file_dictionary : dict
Mapping of {filename: path} of files which should be uploaded to the
server.
file_elements : dict
Mapping of {filename: str} of strings which should be uploaded as
files to the server.
@@ -47,9 +44,8 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):

url = url.replace('=', '%3d')

if file_dictionary is not None or file_elements is not None:
return _read_url_files(url, data=data, file_dictionary=file_dictionary,
file_elements=file_elements)
if file_elements is not None:
return _read_url_files(url, data=data, file_elements=file_elements)
return _read_url(url, data)


@@ -65,32 +61,14 @@ def _file_id_to_url(file_id, filename=None):
return url


def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
"""do a post request to url with data, file content of
file_dictionary and sending file_elements as files"""
def _read_url_files(url, data=None, file_elements=None):
"""do a post request to url with data
and sending file_elements as files"""

data = {} if data is None else data
data['api_key'] = config.apikey
if file_elements is None:
file_elements = {}
if file_dictionary is not None:
for key, path in file_dictionary.items():
path = os.path.abspath(path)
if os.path.exists(path):
try:
if key is 'dataset':
# check if arff is valid?
decoder = arff.ArffDecoder()
with io.open(path, encoding='utf8') as fh:
decoder.decode(fh, encode_nominal=True)
except:
raise ValueError("The file you have provided is not a valid arff file")

file_elements[key] = open(path, 'rb')

else:
raise ValueError("File doesn't exist")

# Using requests.post sets header 'Accept-encoding' automatically to
# 'gzip,deflate'
response = requests.post(url, data=data, files=file_elements)
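
For context, a hypothetical caller-side sketch of the narrowed interface (not part of this diff): instead of handing _perform_api_call a path via file_dictionary, callers now read or build the file content themselves and pass it as strings through file_elements. The endpoint name, XML snippet, and file path below are illustrative placeholders.

from openml._api_calls import _perform_api_call

# Hypothetical description XML and ARFF content; in practice these are
# produced by the dataset object before upload.
description_xml = '<oml:data_set_description>...</oml:data_set_description>'
with open('dataset.arff', 'r', encoding='utf8') as fh:
    arff_string = fh.read()

# All file content is passed in-memory; _read_url_files adds the api_key
# from openml.config and posts everything as multipart form data.
file_elements = {
    'description': description_xml,
    'dataset': arff_string,
}
response_xml = _perform_api_call('data/', file_elements=file_elements)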