Merged
156 changes: 156 additions & 0 deletions examples/Dataset_import.ipynb
@@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import arff\n",
"import numpy as np\n",
"import openml\n",
"import sklearn.datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# For this example we will upload to the test server to not\n",
"# pollute the live server with countless copies of the same\n",
"# dataset\n",
"openml.config.server = 'https://test.openml.org/api/v1/xml'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Load an example dataset from scikit-learn which we will \n",
"# upload to OpenML.org via the API\n",
"breast_cancer = sklearn.datasets.load_breast_cancer()\n",
"name = 'BreastCancer(scikit-learn)'\n",
"X = breast_cancer.data\n",
"y = breast_cancer.target\n",
"attribute_names = breast_cancer.feature_names\n",
"targets = breast_cancer.target_names\n",
"description = breast_cancer.DESCR"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# OpenML does not distinguish between the attributes and\n",
"# targets on the data level and stores all data in a \n",
"# single matrix. The target feature is indicated as \n",
"# meta-data of the dataset (and tasks on that data)\n",
"data = np.concatenate((X, y.reshape((-1, 1))), axis=1)\n",
"attribute_names = list(attribute_names)\n",
"attributes = [\n",
" (attribute_name, 'REAL') for attribute_name in attribute_names\n",
"] + [('class', 'REAL')]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Create the dataset object. \n",
"# The definition of all fields can be found in the XSD files\n",
"# describing the expected format:\n",
"# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd\n",
"dataset = openml.datasets.functions.create_dataset(\n",
" # The name of the dataset (needs to be unique). \n",
" # Must not be longer than 128 characters and only contain\n",
" # a-z, A-Z, 0-9 and the following special characters: _\\-\\.(),\n",
" name=name,\n",
" # Textual description of the dataset.\n",
" description=description,\n",
" # The person who created the dataset.\n",
" creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian',\n",
" # People who contributed to the current version of the dataset.\n",
" contributor=None,\n",
" # The date the data was originally collected, given by the uploader.\n",
" collection_date='01-11-1995',\n",
" # Language in which the data is represented.\n",
" # Starts with 1 upper case letter, rest lower case, e.g. 'English'.\n",
" language='English',\n",
" # License under which the data is/will be distributed.\n",
" licence='BSD (from scikit-learn)',\n",
" # Name of the target. Can also have multiple values (comma-separated).\n",
" default_target_attribute='class',\n",
" # The attribute that represents the row-id column, if present in the dataset.\n",
" row_id_attribute=None,\n",
" # Attributes that should be excluded in modelling, such as identifiers and indexes.\n",
" ignore_attribute=None,\n",
" # How to cite the paper.\n",
" citation=(\n",
" \"W.N. Street, W.H. Wolberg and O.L. Mangasarian. \"\n",
" \"Nuclear feature extraction for breast tumor diagnosis. \"\n",
" \"IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, \"\n",
" \"volume 1905, pages 861-870, San Jose, CA, 1993.\"\n",
" ),\n",
" # Attributes of the data\n",
" attributes=attributes,\n",
" data=data,\n",
" # Format of the dataset. Only 'arff' for now.\n",
" format='arff',\n",
" # A version label which is provided by the user.\n",
" version_label='test',\n",
" original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)',\n",
" paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"231\n"
]
}
],
"source": [
"upload_id = dataset.publish()\n",
"print(upload_id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python OpenMl",
"language": "python",
"name": "openml3.6"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
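The notebook's final cell prints the id assigned by the test server (231 above). Below is a minimal sketch, not part of the committed notebook, of how one might verify the upload by fetching the dataset back. It assumes the same test-server configuration, that the server has finished processing the upload, and that 231 is simply the id printed by the last cell.

import openml

openml.config.server = 'https://test.openml.org/api/v1/xml'

# Fetch the dataset published above; 231 is the id printed by the last cell.
# This only succeeds once the test server has processed the upload.
downloaded = openml.datasets.get_dataset(231)
print(downloaded.name)                       # 'BreastCancer(scikit-learn)'
print(downloaded.default_target_attribute)   # 'class'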
36 changes: 7 additions & 29 deletions openml/_api_calls.py
@@ -11,8 +11,8 @@
OpenMLServerNoResult)


def _perform_api_call(call, data=None, file_dictionary=None,
file_elements=None, add_authentication=True):
def _perform_api_call(call, data=None, file_elements=None,
add_authentication=True):
"""
Perform an API call at the OpenML server.
@@ -24,9 +24,6 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
The API call. For example data/list
data : dict
Dictionary with post-request payload.
file_dictionary : dict
Mapping of {filename: path} of files which should be uploaded to the
server.
file_elements : dict
Mapping of {filename: str} of strings which should be uploaded as
files to the server.
@@ -47,9 +44,8 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):

url = url.replace('=', '%3d')

if file_dictionary is not None or file_elements is not None:
return _read_url_files(url, data=data, file_dictionary=file_dictionary,
file_elements=file_elements)
if file_elements is not None:
return _read_url_files(url, data=data, file_elements=file_elements)
return _read_url(url, data)


@@ -65,32 +61,14 @@ def _file_id_to_url(file_id, filename=None):
return url


def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
"""do a post request to url with data, file content of
file_dictionary and sending file_elements as files"""
def _read_url_files(url, data=None, file_elements=None):
"""do a post request to url with data
and sending file_elements as files"""

data = {} if data is None else data
data['api_key'] = config.apikey
if file_elements is None:
file_elements = {}
if file_dictionary is not None:
for key, path in file_dictionary.items():
path = os.path.abspath(path)
if os.path.exists(path):
try:
if key is 'dataset':
# check if arff is valid?
decoder = arff.ArffDecoder()
with io.open(path, encoding='utf8') as fh:
decoder.decode(fh, encode_nominal=True)
except:
raise ValueError("The file you have provided is not a valid arff file")

file_elements[key] = open(path, 'rb')

else:
raise ValueError("File doesn't exist")

# Using requests.post sets header 'Accept-encoding' automatically to
# 'gzip,deflate'
response = requests.post(url, data=data, files=file_elements)
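
For context, a hypothetical caller-side sketch of the narrowed interface (not part of this diff): instead of handing _perform_api_call a path via file_dictionary, callers now read or build the file content themselves and pass it as strings through file_elements. The endpoint name, XML snippet, and file path below are illustrative placeholders.

from openml._api_calls import _perform_api_call

# Hypothetical description XML and ARFF content; in practice these are
# produced by the dataset object before upload.
description_xml = '<oml:data_set_description>...</oml:data_set_description>'
with open('dataset.arff', 'r', encoding='utf8') as fh:
    arff_string = fh.read()

# All file content is passed in-memory; _read_url_files adds the api_key
# from openml.config and posts everything as multipart form data.
file_elements = {
    'description': description_xml,
    'dataset': arff_string,
}
response_xml = _perform_api_call('data/', file_elements=file_elements)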