
Resolve "Lacking sanity checks to iricdata connection"

Merged Albert Feghaly requested to merge 21-lacking-sanity-checks-to-iricdata-connection into master
+ 136 − 66
@@ -42,12 +42,15 @@ class Client:
if self.token is not None:
if self.pwd is not None:
sys.stderr.write('WARNING: Ambiguous authentification, ignoring ' +
'password in favor of PAT\n')
sys.stderr.write(
'WARNING: Ambiguous authentication, ' +
'ignoring password in favor of PAT\n'
)
self.token = self.token.strip()
session.headers.update({'Iric-Auth-Token': F'{self.token}'})
elif self.user and self.pwd:
elif self.user is not None and self.pwd is not None:
login_url = os.path.join(self.url, 'login/')
session.get(login_url)
csrftoken = session.cookies['csrftoken']
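Note: a minimal sketch of the two authentication paths this hunk distinguishes. The import path and constructor arguments are assumptions inferred from the attributes (token, user, pwd) used in the diff.

from iricdata import Client  # assumed import path

# PAT-based access: the token is stripped and sent as the
# 'Iric-Auth-Token' header on every request.
client = Client(url='https://iric-data.example.org', token='my-pat')

# Username/password access: a CSRF token is first fetched from login/.
client = Client(url='https://iric-data.example.org',
                user='jdoe', pwd='secret')

# Supplying both emits the WARNING above and the password is ignored.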
@@ -61,56 +64,107 @@ class Client:
login = session.post(login_url, cookies=cookies, data=payload,
headers=headers)
if login.status_code == 200:
if any(x in login.text for x in ['Erreur', 'Error']):
raise IricDataConnectionError('Connexion failed -- verify ' +
'your username and password')
else:
sys.stdout.write('Your connexion to IRIC-Data has been ' +
'established [user=%s]\n' % self.user)
else:
raise IricDataConnectionError('Could not initiate connexion ' +
'with IRIC-Data')
if login.status_code != 200:
raise IricDataConnectionError(
'Could not initiate connection with IRIC-Data.'
)
# ==> A cleaner way would be to verify route accesses directly
# as done for PAT-based access
#elif any(x in login.text for x in ['Erreur', 'Error']):
# raise IricDataConnectionError(
# '[Connection failed] Please verify ' +
# 'your username and password.'
# )
else:
raise IricDataConnectionError('Connexion failed -- please verify ' +
'that you submitted a token or a ' +
'username/password combination')
raise IricDataConnectionError(
'[Connection failed] Missing PAT or ' +
'username/password combination.'
)
self.session = session
self.datasets = self.get_available_datasets()
sys.stdout.write(
'Your connection to IRIC-Data has been ' +
'established [user=%s]\n' % self.user
)
try:
self.labs = self.get_available_labs()
except Exception:
self.labs = None
print('get_available_labs() method is not implemented in api/v1 of IRIC-Data')
sys.stderr.write(
'get_available_labs() method is not ' +
'implemented in api/v1 of IRIC-Data\n'
)
try:
self.datafiles = self.get_available_datafiles()
except Exception:
self.datafiles = None
sys.stderr.write(
'get_available_datafiles() method is not ' +
'implemented in api/v1 of IRIC-Data\n'
)
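Note: after construction, labs and datafiles degrade to None when the server only exposes api/v1, so downstream code should guard for that, e.g.:

# Hedged sketch, assuming `client` from the constructor example above.
if client.labs is None:
    # api/v1 server: lab metadata is unavailable, keep to dataset queries.
    pass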
def get_response(self, url, **kwargs):
"""Manage GET requests to IRIC-Data
:param url: IRIC-Data view URL.
:param **kwargs: Optional arguments for request.session().get().
"""
r = self.session.get(url, **kwargs)
if r.status_code == 200:
return r
elif r.status_code == 401:
raise IricDataConnectionError(
'[Authentication failed] ' +
'Please double-check your credentials.'
)
elif r.status_code == 404:
sys.stderr.write(
'[Not found] ' +
'The requested resource does not exist.\n'
)
return None
else:
raise IricDataConnectionError(
'[Error %d] ' % r.status_code +
'Cannot continue with your request.'
)
self.datasets = self.get_available_datasets()
#self.datafiles = self.get_available_datafiles()
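Note: get_response() centralizes the status-code handling that every GET below now relies on. The contract, sketched with a hypothetical URL (assuming `client` from the constructor example above):

# 200 -> the Response object is returned
# 401 -> IricDataConnectionError is raised
# 404 -> a warning goes to stderr and None is returned
# any other code -> IricDataConnectionError with the status code
r = client.get_response('https://iric-data.example.org/secure/datafiles/meta/42')
if r is not None:  # None only happens on 404
    payload = r.json()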
""" Return DataFrame with Name and ID for available labs """
def get_available_labs(self):
r = self.session.get(os.path.join(
self.url, 'secure/datafiles/annotate'
))
endpoint_url = os.path.join(self.url, 'secure/datafiles/annotate')
r = self.get_response(endpoint_url)
soup = BeautifulSoup(r.text, 'html.parser')
options = soup.find(id='id_lab').find_all('option')
df = pd.DataFrame({
'lab_id': [x['value'] for x in options],
'lab_name': [x.contents[0] for x in options]})
if ''.join(list(set(df.loc[0].lab_name))) == '-':
df.drop(0, axis=0, inplace=True)
df.index = df.lab_id.copy()
df.index.name = 'ID'
return df
""" Fetch Dataset Name and Slug ID available to the user as JSON """
def get_available_datasets(self):
r = self.session.get(
os.path.join(self.url, self.api_my_datasets)
).json()
endpoint_url = os.path.join(self.url, self.api_my_datasets)
r = self.get_response(endpoint_url).json()
df = pd.DataFrame(r['data'], columns=['dataset_name', 'dataset_slug'])
df.index = df.dataset_slug.copy()
df.index.name = 'ID'
return df
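Note: both lookup tables are indexed by their ID column, so .loc works directly. Values are hypothetical, and labs may be None on api/v1 servers (see the fallback in __init__ above).

client.datasets.loc['my-dataset-slug'].dataset_name  # name for a given slug
if client.labs is not None:
    client.labs.loc['3'].lab_name  # lab_id values are scraped as strings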
""" Return a DataFrame of user's files. A filter can be made on annotation
@@ -124,7 +178,7 @@ class Client:
return(None)
elif key_anno is not None and value_anno is not None:
if dataset_id is None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key_value,
@@ -132,7 +186,7 @@ class Client:
)
).json()
else:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key_value,
@@ -141,7 +195,7 @@ class Client:
).json()
elif key_anno is not None:
if dataset_id is None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key,
@@ -149,7 +203,7 @@ class Client:
)
).json()
else:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key,
@@ -158,7 +212,7 @@ class Client:
).json()
elif value_anno is not None:
if dataset_id is None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_value,
@@ -166,7 +220,7 @@ class Client:
)
).json()
else:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_value,
@@ -174,7 +228,7 @@ class Client:
)
).json()
elif dataset_id is not None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_dataset,
@@ -184,6 +238,7 @@ class Client:
# id is internal to iric-data
df = pd.DataFrame(r['data']).rename({'id': 'numerical_id'}, axis=1)
if 'iric_data_id' in df.columns:
df.index = df.iric_data_id
df.index.name = 'ID'
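Note: the get_response() substitutions above cover every branch of get_datafiles_list(). The call patterns look like this (argument names come from the diff, values are hypothetical; with no filter at all the method appears to return None):

client.get_datafiles_list(key_anno='tissue', value_anno='liver')
client.get_datafiles_list(key_anno='tissue', dataset_id='my-dataset-slug')
client.get_datafiles_list(value_anno='liver')
client.get_datafiles_list(dataset_id='my-dataset-slug')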
@@ -196,41 +251,41 @@ class Client:
""" Return metadata JSON for a given file_id """
def get_file_metadata(self, file_id):
metadata = self.session.get(os.path.join(
endpoint_url = os.path.join(
self.url, self.api_datafiles_meta, str(file_id)
))
try:
)
metadata = self.get_response(endpoint_url)
if metadata is not None:
metadata = metadata.json()
# TODO except something:
except:
sys.stderr.write('ERROR: File %s does not exist in ' +
'database\n' % str(file_id))
metadata = None
return metadata
""" Return annotations JSON for a given file_id """
def get_file_annotation(self, file_id):
annotation = self.session.get(os.path.join(
self.url, self.api_datafiles_annotation, str(file_id)
))
try:
endpoint_url = os.path.join(self.url,
self.api_datafiles_annotation, str(file_id))
annotation = self.get_response(endpoint_url)
if annotation is not None:
annotation = annotation.json()
# TODO except something:
except:
sys.stderr.write('ERROR: File %s does not exist in ' +
'database\n' % str(file_id))
annotation = None
return annotation
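Note: with get_response() in place, both helpers return a dict on success and None when the file is unknown, instead of relying on a bare except. Hypothetical file id:

meta = client.get_file_metadata(42)    # dict, or None on 404
anno = client.get_file_annotation(42)  # dict, or None on 404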
def get_file_content(self, file_id):
"""Returns datafile content for a given file_id"""
conn = self.get_file_data_conn(file_id)
if conn is not None:
return conn.content
else:
return None
""" Return a connector of the contents of a file for a given file_id """
def get_file_data_conn(self, file_id):
path = os.path.join(self.url, 'secure/datafiles/download', str(file_id))
try:
content = self.session.get(path, allow_redirects=True)
# TODO except something:
except:
sys.stderr.write('ERROR: File %s does not exist in database\n' % str(file_id))
content = None
dwnl_url = os.path.join(self.url,
'secure/datafiles/download', str(file_id))
content = self.get_response(dwnl_url)
return content
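Note: get_file_content() delegates to get_file_data_conn(), which now routes through get_response(), so a missing id yields None rather than an exception. Sketch with a hypothetical id and output name:

raw = client.get_file_content(42)
if raw is not None:
    with open('datafile.bin', 'wb') as fh:
        fh.write(raw)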
""" Return DataFrame (file_name, file_id, file_slug, file_hash) for
@@ -243,31 +298,39 @@ class Client:
""" Get a subset of the available datasets for which there is a match """
def filter_datasets(self, term, exact_match=False):
df = self.datasets
if exact_match:
return self.datasets[self.datasets.dataset_name.str.fullmatch(term)]
return df[df.dataset_name.str.fullmatch(term)]
else:
return self.datasets[self.datasets.dataset_name.str.contains(term)]
return df[df.dataset_name.str.contains(term)]
""" Get a subset of the available datafiles for which there is a match,
kwargs refer to get_datafiles_list arguments
"""
def filter_datafiles(self, term, field='filename', exact_match=False, **kwargs):
df = self.get_datafiles_list(**kwargs)
if exact_match:
return df[df[field].str.fullmatch(term)]
if df is None:
sys.stderr.write("Please try again with more arguments, for more details please see function get_datafiles_list()")
return None
elif df.empty:
return None
else:
return df[df[field].str.contains(term)]
if exact_match:
return df[df[field].str.fullmatch(term)]
else:
return df[df[field].str.contains(term)]
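Note: hypothetical filtering calls; field defaults to 'filename' and any extra kwargs are forwarded to get_datafiles_list().

partial = client.filter_datafiles('sample_', dataset_id='my-dataset-slug')
exact = client.filter_datafiles('sample_01.fastq.gz', exact_match=True,
                                dataset_id='my-dataset-slug')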
""" Get file content according to file_id """
def get_file(self, file_id):
try:
file_metadata = self.get_file_metadata(file_id)
file_content = self.get_file_data_conn(file_id).content
file_content = self.get_file_content(file_id)
file_annotation = self.get_file_annotation(file_id)
return IDF(file_metadata, file_content, file_annotation)
except TypeError:
return IDF(None, None, None)
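Note: get_file() bundles the three lookups into an IDF object, and a failed lookup now degrades to IDF(None, None, None) instead of raising. Hypothetical id:

idf = client.get_file(42)  # carries metadata, content and annotation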
""" Download file according to file_id """
def dwnl_file_content(self, file_id, folder_out=None, filename=None):
file_meta = self.get_file_metadata(file_id)
@@ -277,9 +340,10 @@ class Client:
folder_out = os.path.join(os.getcwd(), folder_out)
else:
folder_out = os.getcwd()
file_conn = self.get_file_data_conn(file_id)
filename = file_meta['filename'] if filename is None else filename
out_file_path = os.path.join(folder_out, filename)
file_conn = self.get_file_data_conn(file_id)
assert file_conn is not None # should never be None here
if os.path.exists(out_file_path):
sys.stderr.write('Warning: File already exists at ' +
'location %s, skipping.\n' % out_file_path)
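Note: hedged usage sketch; a relative folder_out is resolved against os.getcwd(), and an existing file at the target path is skipped with a warning.

client.dwnl_file_content(42, folder_out='downloads')  # id hypothetical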
@@ -298,9 +362,9 @@ class Client:
folder_out = os.path.join(os.getcwd(), folder_out)
else:
folder_out = os.getcwd()
annotations = self.get_file_annotation(file_id)
filename = file_meta['filename'] if filename is None else filename
out_file_path = os.path.join(folder_out, filename + '.json')
annotations = self.get_file_annotation(file_id)
if os.path.exists(out_file_path):
sys.stderr.write('Warning: File already exists at ' +
'location %s, skipping.\n' % out_file_path)
@@ -309,11 +373,17 @@ class Client:
with open(out_file_path, 'w') as outfile:
json.dump(annotations, outfile)
""" Download an entire dataset """
def dwnl_dataset(self, dataset_id, folder_out=None, datasetname=None):
dataset = self.get_dataset_filelist(dataset_id)
datasetname = self.datasets.loc[dataset_id].dataset_name if datasetname is None else datasetname
for file_id in np.unique(dataset.file_id):
if folder_out:
if folder_out[0] != '/':
folder_out = os.path.join(os.getcwd(), folder_out)
else:
folder_out = os.getcwd()
for file_id in np.unique(dataset.iric_data_id):
self.dwnl_file_content(
file_id, os.path.join(folder_out, datasetname)
)
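Note: dwnl_dataset() now resolves folder_out once before the loop and reuses dwnl_file_content() for each unique iric_data_id. Hypothetical slug:

client.dwnl_dataset('my-dataset-slug', folder_out='downloads')
# files land in downloads/<dataset_name>/ under the working directory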
@@ -486,7 +556,7 @@ class Client:
file_id = str(file_id)
update_url = os.path.join(self.url, 'secure/datafiles/update', file_id)
r = self.session.get(update_url)
r = self.get_response(update_url)
soup = BeautifulSoup(r.text, 'html.parser')
current_filename = soup.find(id='id_filename')['value']
current_lab = soup.find(id='id_lab').find_all('option', selected=True)[0]['value']