diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 0c9da1caf..901ee3423 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -13,8 +13,10 @@
 import arff
 import numpy as np
 import pandas as pd
+import requests
 import scipy.sparse
 import xmltodict
+from tqdm import tqdm
 
 from openml.base import OpenMLBase
 from openml.exceptions import PyOpenMLError
@@ -343,11 +345,46 @@ def __eq__(self, other: Any) -> bool:
     def _download_data(self) -> None:
-        """Download ARFF data file to standard cache directory. Set `self.data_file`."""
-        # import required here to avoid circular import.
-        from .functions import _get_dataset_arff, _get_dataset_parquet
-
-        self.data_file = str(_get_dataset_arff(self))
-        if self._parquet_url is not None:
-            self.parquet_file = str(_get_dataset_parquet(self))
+        """Stream the ARFF data file into ``self.data_file``, with a progress bar."""
+        # NOTE: assumes ``self.url`` and ``self.data_file`` (the cache path to
+        # write to) are already set; the previous implementation resolved the
+        # cache path via ``_get_dataset_arff``.
+        if self.url is None or self.data_file is None:
+            raise ValueError("`self.url` and `self.data_file` must be set before downloading.")
+
+        response = requests.get(self.url, stream=True)
+        total_size_in_bytes = int(response.headers.get("content-length", 0))
+        if total_size_in_bytes == 0:
+            logger.warning(
+                "Could not retrieve the content length from the header; "
+                "download progress cannot be reported accurately.",
+            )
+        # ``total=None`` gives a rate-only bar when the size is unknown, so the
+        # bar is always defined and the loop below cannot hit an unbound name.
+        progress_bar = tqdm(total=total_size_in_bytes or None, unit="iB", unit_scale=True)
+        try:
+            with open(self.data_file, "wb") as file:
+                for chunk in response.iter_content(1024):
+                    progress_bar.update(len(chunk))
+                    file.write(chunk)
+        finally:
+            progress_bar.close()
+        if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+            logger.error(
+                "Download incomplete: expected %d bytes but received %d.",
+                total_size_in_bytes,
+                progress_bar.n,
+            )
 
     def _get_arff(self, format: str) -> dict:  # noqa: A002
         """Read ARFF file and return decoded arff.
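For reference, the download loop above is the standard requests-plus-tqdm streaming pattern. A minimal self-contained sketch of that pattern (the function name and the added `raise_for_status` call are illustrative additions, not part of this diff):

import requests
from tqdm import tqdm


def download_with_progress(url: str, dest: str, chunk_size: int = 1024) -> None:
    """Stream `url` into `dest` while updating a byte-level progress bar."""
    response = requests.get(url, stream=True)
    response.raise_for_status()
    # content-length may be absent (e.g. chunked transfer encoding);
    # tqdm accepts total=None and falls back to a rate-only bar.
    total = int(response.headers.get("content-length", 0)) or None
    with open(dest, "wb") as fh, tqdm(total=total, unit="iB", unit_scale=True) as bar:
        for chunk in response.iter_content(chunk_size):
            fh.write(chunk)
            bar.update(len(chunk))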
""" try: - data = self._get_arff(self.format) - except OSError as e: - logger.critical( - f"Please check that the data file {arff_file_path} is " "there and can be read.", - ) - raise e - - ARFF_DTYPES_TO_PD_DTYPE = { - "INTEGER": "integer", - "REAL": "floating", - "NUMERIC": "floating", - "STRING": "string", - } - attribute_dtype = {} - attribute_names = [] - categories_names = {} - categorical = [] - for name, type_ in data["attributes"]: - # if the feature is nominal and a sparse matrix is - # requested, the categories need to be numeric - if isinstance(type_, list) and self.format.lower() == "sparse_arff": - try: - # checks if the strings which should be the class labels - # can be encoded into integers - pd.factorize(type_)[0] - except ValueError as e: - raise ValueError( - "Categorical data needs to be numeric when using sparse ARFF." - ) from e - - # string can only be supported with pandas DataFrame - elif type_ == "STRING" and self.format.lower() == "sparse_arff": - raise ValueError("Dataset containing strings is not supported with sparse ARFF.") - - # infer the dtype from the ARFF header - if isinstance(type_, list): - categorical.append(True) - categories_names[name] = type_ - if len(type_) == 2: - type_norm = [cat.lower().capitalize() for cat in type_] - if {"True", "False"} == set(type_norm): - categories_names[name] = [cat == "True" for cat in type_norm] - attribute_dtype[name] = "boolean" - else: - attribute_dtype[name] = "categorical" - else: - attribute_dtype[name] = "categorical" + file_size = arff_file_path.stat().st_size + progress_bar = tqdm.tqdm(total=file_size, unit="iB", unit_scale=True) + + with open(arff_file_path, encoding="utf8") as file: + data = [] + attributes = [] + data_started = False + for line in file: + progress_bar.update(len(line.encode("utf-8"))) # Update based on bytes read + line = line.strip() + if line.startswith("@data"): + data_started = True + continue + elif line.startswith("@attribute"): + attributes.append(line) + continue + + if data_started and line: + data.append(line.split(",")) + + progress_bar.close() + + # Convert the parsed data into a DataFrame or a sparse matrix as needed + df = pd.DataFrame(data, columns=[attr.split(" ")[1] for attr in attributes]) + categorical = ["nominal" in attr for attr in attributes] + attribute_names = [attr.split(" ")[1] for attr in attributes] + + # Convert strings to numeric values if required + for column in df.columns[categorical]: + df[column] = pd.Categorical(df[column]).codes + + if self.format.lower() == "sparse_arff": + # Convert DataFrame to a sparse matrix + from scipy.sparse import csr_matrix + + matrix = csr_matrix(df.values) + return matrix, categorical, attribute_names else: - categorical.append(False) - attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_] - attribute_names.append(name) - - if self.format.lower() == "sparse_arff": - X = data["data"] - X_shape = (max(X[1]) + 1, max(X[2]) + 1) - X = scipy.sparse.coo_matrix((X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) - X = X.tocsr() - elif self.format.lower() == "arff": - X = pd.DataFrame(data["data"], columns=attribute_names) - - col = [] - for column_name in X.columns: - if attribute_dtype[column_name] in ("categorical", "boolean"): - categories = self._unpack_categories( - X[column_name], # type: ignore - categories_names[column_name], - ) - col.append(categories) - elif attribute_dtype[column_name] in ("floating", "integer"): - X_col = X[column_name] - if X_col.min() >= 0 and X_col.max() <= 255: - try: - X_col_uint = 
X_col.astype("uint8") - if (X_col == X_col_uint).all(): - col.append(X_col_uint) - continue - except ValueError: - pass - col.append(X[column_name]) - else: - col.append(X[column_name]) - X = pd.concat(col, axis=1) - else: - raise ValueError(f"Dataset format '{self.format}' is not a valid format.") + return df, categorical, attribute_names - return X, categorical, attribute_names # type: ignore + except OSError as e: + logger.critical(f"Cannot read {arff_file_path}.") + raise e + # try: + # data = self._get_arff(self.format) + # except OSError as e: + # logger.critical( + # f"Please check that the data file {arff_file_path} is " "there and can be read.", + # ) + # raise e + + # ARFF_DTYPES_TO_PD_DTYPE = { + # "INTEGER": "integer", + # "REAL": "floating", + # "NUMERIC": "floating", + # "STRING": "string", + # } + # attribute_dtype = {} + # attribute_names = [] + # categories_names = {} + # categorical = [] + # for name, type_ in data["attributes"]: + # # if the feature is nominal and a sparse matrix is + # # requested, the categories need to be numeric + # if isinstance(type_, list) and self.format.lower() == "sparse_arff": + # try: + # # checks if the strings which should be the class labels + # # can be encoded into integers + # pd.factorize(type_)[0] + # except ValueError as e: + # raise ValueError( + # "Categorical data needs to be numeric when using sparse ARFF." + # ) from e + + # # string can only be supported with pandas DataFrame + # elif type_ == "STRING" and self.format.lower() == "sparse_arff": + # raise ValueError("Dataset containing strings is not supported with sparse ARFF.") + + # # infer the dtype from the ARFF header + # if isinstance(type_, list): + # categorical.append(True) + # categories_names[name] = type_ + # if len(type_) == 2: + # type_norm = [cat.lower().capitalize() for cat in type_] + # if {"True", "False"} == set(type_norm): + # categories_names[name] = [cat == "True" for cat in type_norm] + # attribute_dtype[name] = "boolean" + # else: + # attribute_dtype[name] = "categorical" + # else: + # attribute_dtype[name] = "categorical" + # else: + # categorical.append(False) + # attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_] + # attribute_names.append(name) + + # if self.format.lower() == "sparse_arff": + # X = data["data"] + # X_shape = (max(X[1]) + 1, max(X[2]) + 1) + # X = scipy.sparse.coo_matrix((X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) + # X = X.tocsr() + # elif self.format.lower() == "arff": + # X = pd.DataFrame(data["data"], columns=attribute_names) + + # col = [] + # for column_name in X.columns: + # if attribute_dtype[column_name] in ("categorical", "boolean"): + # categories = self._unpack_categories( + # X[column_name], # type: ignore + # categories_names[column_name], + # ) + # col.append(categories) + # elif attribute_dtype[column_name] in ("floating", "integer"): + # X_col = X[column_name] + # if X_col.min() >= 0 and X_col.max() <= 255: + # try: + # X_col_uint = X_col.astype("uint8") + # if (X_col == X_col_uint).all(): + # col.append(X_col_uint) + # continue + # except ValueError: + # pass + # col.append(X[column_name]) + # else: + # col.append(X[column_name]) + # X = pd.concat(col, axis=1) + # else: + # raise ValueError(f"Dataset format '{self.format}' is not a valid format.") + + # return X, categorical, attribute_names # type: ignore def _compressed_cache_file_paths(self, data_file: Path) -> tuple[Path, Path, Path]: data_pickle_file = data_file.with_suffix(".pkl.py3") diff --git a/openml/run_openml.py b/openml/run_openml.py new 
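The hand-rolled parser above only handles plain ARFF: attributes declared one per line, nominal types in braces, and bare comma-separated data rows. A toy illustration of the header classification it performs (standalone, with a made-up ARFF snippet; purely illustrative):

# Toy ARFF header, showing the shape of input the line-based parser expects.
ARFF_SNIPPET = """\
@relation toy
@attribute sepal_length numeric
@attribute species {setosa,versicolor}
@data
5.1,setosa
6.2,versicolor
"""

attributes = [
    line.strip()
    for line in ARFF_SNIPPET.splitlines()
    if line.lower().startswith("@attribute")
]
# Second whitespace-separated token is the attribute name; braces mark nominals.
names = [attr.split(" ")[1] for attr in attributes]
categorical = ["{" in attr for attr in attributes]
assert names == ["sepal_length", "species"]
assert categorical == [False, True]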
diff --git a/openml/run_openml.py b/openml/run_openml.py
new file mode 100644
index 000000000..3814953bf
--- /dev/null
+++ b/openml/run_openml.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+import time
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from openml import datasets
+
+# Fetch an example dataset by ID (1471) so the new download progress bar is exercised
+openml_dataset = datasets.get_dataset(
+    1471, download_data=True, download_qualities=True, download_features_meta_data=True
+)
+
+# Get the data (X features, y target) from the dataset
+X, y, _, _ = openml_dataset.get_data(target=openml_dataset.default_target_attribute)
+
+# Split the data into train and test sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Initialize a classifier (example: RandomForest)
+clf = RandomForestClassifier()
+
+# Train the classifier
+clf.fit(X_train, y_train)
+
+# Make predictions
+y_pred = clf.predict(X_test)
+
+# Evaluate the classifier
+accuracy = accuracy_score(y_test, y_pred)
+print(f"Test accuracy: {accuracy:.3f}")
+
+# Standalone tqdm smoke test: confirms the progress bar renders in this environment
+for _i in tqdm(range(10)):
+    time.sleep(0.5)
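As a possible follow-up (an alternative, not what this diff implements): recent tqdm versions provide `tqdm.wrapattr`, which wraps a file object's `write` method and advances the bar by bytes written, removing the manual `update` calls. A sketch, assuming a tqdm version that ships `wrapattr`:

import requests
from tqdm import tqdm


def download_with_wrapattr(url: str, dest: str) -> None:
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0)) or None
    # wrapattr intercepts each write(...) on the wrapped handle and updates
    # the bar automatically; byte-based units are its default.
    with open(dest, "wb") as fh, tqdm.wrapattr(fh, "write", total=total) as wrapped:
        for chunk in response.iter_content(1024):
            wrapped.write(chunk)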