fix: finish type definitions for datasets

Bilgecelik · Bilgecelik · commit fc084fdc24e1 · 2023-06-14T15:31:56.000+03:00
diff --git a/openml/base.py b/openml/base.py
@@ -46,7 +46,7 @@ def _entity_letter(cls) -> str:
         return cls.__name__.lower()[len("OpenML") :][0]
 
     @abstractmethod
-    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
+    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str], None]]]:
         """Collect all information to display in the __repr__ body.
 
         Returns
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -118,8 +118,8 @@ def __init__(
         description: str,
         data_format: str = "arff",
         cache_format: str = "pickle",
-        dataset_id: Optional[int] = None,
-        version: Optional[int] = None,
+        dataset_id: Optional[str] = None,
+        version: Optional[str] = None,
         creator: Optional[str] = None,
         contributor: Optional[str] = None,
         collection_date: Optional[str] = None,
@@ -129,7 +129,7 @@ def __init__(
         url: Optional[str] = None,
         default_target_attribute: Optional[str] = None,
         row_id_attribute: Optional[str] = None,
-        ignore_attribute: Optional[str] = None,
+        ignore_attribute: Optional[Union[List[str], str]] = None,
         version_label: Optional[str] = None,
         citation: Optional[str] = None,
         tag: Optional[str] = None,
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -473,8 +473,6 @@ def get_dataset(
         dataset = _create_dataset_from_description(
             description, features_file, qualities_file, arff_file, parquet_file, cache_format
         )
-    else:
-        dataset = None
     return dataset
 
 
@@ -994,7 +992,7 @@ def _get_dataset_description(did_cache_dir: str, dataset_id: int) -> Dict[str, s
 
 
 def _get_dataset_parquet(
-    description: Union[Dict[str, Union[str, int]], OpenMLDataset],
+    description: Union[Dict[str, str], OpenMLDataset],
     cache_directory: Optional[str] = None,
     download_all_files: bool = False,
 ) -> Optional[str]:
@@ -1025,12 +1023,12 @@ def _get_dataset_parquet(
     output_filename : string, optional
         Location of the Parquet file if successfully downloaded, None otherwise.
     """
-    if isinstance(description, dict):
-        url = cast(str, description.get("oml:minio_url"))
-        did = description.get("oml:id")
-    elif isinstance(description, OpenMLDataset):
+    if isinstance(description, OpenMLDataset):
         url = cast(str, description._minio_url)
         did = description.dataset_id
+    elif isinstance(description, dict):
+        url = cast(str, description.get("oml:minio_url"))
+        did = int(description.get("oml:id", ""))
     else:
         raise TypeError("`description` should be either OpenMLDataset or Dict.")
 
@@ -1063,7 +1061,7 @@ def _get_dataset_parquet(
 
 
 def _get_dataset_arff(
-    description: Union[Dict[str, Union[str, int]], OpenMLDataset],
+    description: Union[Dict[str, str], OpenMLDataset],
     cache_directory: Optional[str] = None,
 ) -> str:
     """Return the path to the local arff file of the dataset. If is not cached, it is downloaded.
@@ -1088,14 +1086,14 @@ def _get_dataset_arff(
     output_filename : string
         Location of ARFF file.
     """
-    if isinstance(description, dict):
-        md5_checksum_fixture = description.get("oml:md5_checksum")
-        url = description["oml:url"]
-        did = description.get("oml:id")
-    elif isinstance(description, OpenMLDataset):
+    if isinstance(description, OpenMLDataset):
         md5_checksum_fixture = description.md5_checksum
-        url = description.url
+        url = cast(str, description.url)
         did = description.dataset_id
+    elif isinstance(description, dict):
+        md5_checksum_fixture = description.get("oml:md5_checksum")
+        url = cast(str, description["oml:url"])
+        did = int(description.get("oml:id", ""))
     else:
         raise TypeError("`description` should be either OpenMLDataset or Dict.")
 
@@ -1214,8 +1212,8 @@ def _create_dataset_from_description(
         Dataset object from dict and ARFF.
     """
     return OpenMLDataset(
-        description["oml:name"],
-        description.get("oml:description"),
+        name=description["oml:name"],
+        description=description.get("oml:description", ""),
         data_format=description["oml:format"],
         dataset_id=description["oml:id"],
         version=description["oml:version"],
@@ -1246,7 +1244,7 @@ def _create_dataset_from_description(
     )
 
 
-def _get_online_dataset_arff(dataset_id: int) -> str:
+def _get_online_dataset_arff(dataset_id: int) -> Optional[str]:
     """Download the ARFF file for a given dataset id
     from the OpenML website.
 
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -173,7 +173,7 @@ def extension(self):
                 "No extension could be found for flow {}: {}".format(self.flow_id, self.name)
             )
 
-    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
+    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str], None]]]:
         """Collect all information to display in the __repr__ body."""
         fields = {
             "Flow Name": self.name,
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -189,7 +189,7 @@ def _evaluation_summary(self, metric: str) -> str:
 
         return "{:.4f} +- {:.4f}".format(np.mean(rep_means), np.mean(rep_stds))
 
-    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
+    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str], None]]]:
         """Collect all information to display in the __repr__ body."""
         # Set up fields
         fields = {
diff --git a/openml/study/study.py b/openml/study/study.py
@@ -97,7 +97,7 @@ def _entity_letter(cls) -> str:
     def id(self) -> Optional[int]:
         return self.study_id
 
-    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
+    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str], None]]]:
         """Collect all information to display in the __repr__ body."""
         fields: Dict[str, Any] = {
             "Name": self.name,
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
@@ -80,7 +80,7 @@ def _entity_letter(cls) -> str:
     def id(self) -> Optional[int]:
         return self.task_id
 
-    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
+    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str], None]]]:
         """Collect all information to display in the __repr__ body."""
         fields: Dict[str, Any] = {
             "Task Type Description": "{}/tt/{}".format(

Original file line number	Diff line number	Diff line change
`@@ -173,7 +173,7 @@ def extension(self):`
`173`	`173`	`"No extension could be found for flow {}: {}".format(self.flow_id, self.name)`
`174`	`174`	`)`
`175`	`175`
`176`		`- def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:`
	`176`	`+ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str], None]]]:`
`177`	`177`	`"""Collect all information to display in the __repr__ body."""`
`178`	`178`	`fields = {`
`179`	`179`	`"Flow Name": self.name,`