Trying to make the release parser checksum-aware and store the appropriate range of checksums in a key/value pair. Trying to use list slicing to extract each range.

2022-08-22 10:54:32 -05:00 · 2022-08-22 10:54:32 -05:00 · fff732f932
commit fff732f932
parent 26e55b186e
1 changed files with 77 additions and 53 deletions
--- a/tools/ubuntu_package_puller/release.py
+++ b/tools/ubuntu_package_puller/release.py
@ -35,85 +35,112 @@ class Release:
        and shasums in them. """
        try:
            data = {}
-            #cleaned_line = re.split(RELEASE_WORD_MATCHER, line)
            split_line_arr = line.split(':')
            split_line_arr_len = len(split_line_arr)

            if split_line_arr_len == 1:
-                # example: [' 26f7612b4526f7b97b6b2f7abbdd59e5f83f879a0dbdcce799b7b91bc36387de            36973 universe/debian-installer/binary-sparc/Packages\n'] 
-
-                l = split_line_arr[0]
-                checksum_size_and_filepath = l.strip().split(' ')
-
-                while ('' in checksum_size_and_filepath):
-                    checksum_size_and_filepath.remove('')
-
-                data['checksum'] = checksum_size_and_filepath[0]
-                data['filesize'] = checksum_size_and_filepath[1]
-                data['filepath'] = checksum_size_and_filepath[2]
+                data = self.checksum_line_clean(split_line_arr)
                
            elif split_line_arr_len >= 2:
-                k = split_line_arr[0].strip()
-                v = split_line_arr[1]
+                k = split_line_arr[0].strip().lower()
+                v = split_line_arr[1].strip()
+
+                #if k == 'md5sum':
+                #    v = []
+                #elif k == 'sha256':
+                #    v = []
+
                data[k] = v

            return data 
        except Exception as e:
-            print('failed to clean')
-            print(line)
-            print(e)
+            print(f'Failed to clean {line}')
+            print(e.__doc__)
+            print(e.message)
+
+    def checksum_line_clean(self, checksum_line_arr):
+        """
+        in: [' 26f7612b4526f7b97b6b2f7abbdd59e5f83f879a0dbdcce799b7b91bc36387de            36973 universe/debian-installer/binary-sparc/Packages\n'] 
+        out: { 'checksum': '26f7612b4526f7b97b6b2f7abbdd59e5f83f879a0dbdcce799b7b91bc36387de', 'filesize': '36973', 'filepath': 'universe/debian-installer/binary-sparc/Packages' } (the line is stripped first, so the trailing newline is not kept)
+        """
+        data = {}
+
+        l = checksum_line_arr[0]
+        checksum_size_and_filepath = l.strip().split(' ')
+
+        while ('' in checksum_size_and_filepath):
+            checksum_size_and_filepath.remove('')
+
+        data['checksum'] = checksum_size_and_filepath[0]
+        data['filesize'] = checksum_size_and_filepath[1]
+        data['filepath'] = checksum_size_and_filepath[2]
+
+        return data

    def validate_parse(self):
        """ Use this method for validating the entire returned dictionary. make
        sure it has the expected keys we want/expect. """
        return 

-    def md5_from_line(line):
-        return 
-
-    def sha256_from_line(line):
-        return
-
    def release_file_parse(self, release_file_url):
        """ Returns the parsed_release_file parsed as a list of dicts """
+        data = {}
+
        with request.urlopen(f'{release_file_url}') as response:
-            index_counter = 0
+            index_count = 0
            lines = TextIOWrapper(response, encoding='utf-8')
+            lines = [ l for l in lines ] 

            cleaned_lines = []

            for l in lines:
+                index_count += 1
                cleaned_line = self.line_format(l)
+
+                # TODO: FINISH THIS. MAKE A FUNCTION THAT HANDLES THIS STUFF.
+                if 'md5sum' in cleaned_line:
+                    # this should get the dict with md5sum
+                    # next one should be a range of each dicts checksum,
+                    # filesize, and filepath until we reach sha256.
+                    md5sum_index = index_count
+                    start_md5_checksums = md5sum_index + 1
+                # the older distros use sha1
+                elif 'sha1' in cleaned_line:
+                    sha1sum_index = index_count
+                    start_sha1_checksums = sha1sum_index + 1
+                    end_md5_checksums = sha1sum_index - 1
+                    if 'aquire-by-hash' in cleaned_line:
+                        aquire_by_hash_index = index_count 
+                        end_sha1sum_checksums = aquire_by_hash_index - 1
+                # newer distros use sha256
+                elif 'sha256' in cleaned_line:
+                    sha256sum_index = index_count
+                    end_md5_checksums = sha256sum_index - 1
+                    start_sha256_checksums = sha256sum_index + 1
+                    if 'aquire-by-hash' in cleaned_line:
+                        aquire_by_hash_index = index_count
+                        end_sha256_checksums = aquire_by_hash_index - 1
+                #else:
+                    #cleaned_lines.append(cleaned_line)
                cleaned_lines.append(cleaned_line)
-                #print(cleaned_line)

-            return cleaned_lines
-            #return parsed_release_file

-"""
-            stripped_lines = [
-                    re.split(RELEASE_WORD_MATCHER, l.strip()[::1]) for l in
-                    lines ]
-            print(stripped_lines)
-            grouped_lines = [list(group) for key, group in
-                    groupby(stripped_lines, lambda x: x == []) if not key]
-            list_of_dicts = []
+                # can we use list slicing to extract each range? we know where
+                # the index is for each.

-            for group in grouped_lines:
-                d = {}
-                # list of each group
-                for arr in group:
-                    arr_per_group = len(group)
-                    k = arr[0].lower().replace(":", "").strip()
-                    v = arr[1].strip()
+            md5sums = cleaned_lines[start_md5_checksums:end_md5_checksums]
+            print(f'index of md5 start: {start_md5_checksums}')
+            print(f'index of md5 end: {end_md5_checksums}')
+            sha256sums = cleaned_lines[start_sha256_checksums:end_sha256_checksums]
+            print(f'index of sha256 start: {start_sha256_checksums}')
+            print(f'index of sha256 end: {end_sha256_checksums}')
+            #sha1sums = cleaned_lines[start_sha1_checksums:end_sha1_checksums]
+            #print(f'index of sha1 start: {start_sha1_checksums}')
+            #print(f'index of sha1 end: {end_sha1_checksums}')
+            #return cleaned_lines
+            #return md5sums
+            return sha256sums

-                    # this builds up our dict by adding one k,v pair per key
-                    d[f"{k}"] = v
-
-                    list_of_dicts.append(d) if arr_per_group == len(d.keys()) else None 
-
-            return list_of_dicts
-"""

 if __name__ == '__main__':
    # testing
@ -127,12 +154,9 @@ if __name__ == '__main__':
            url = meta_release_dict[distro]
            try:
                results = r.release_file_parse(url)
-                print(type(results))
-                print(len(results))
                for d in results:
-                    print(type(d))
                    print(d)
            except Exception as e:
                print(e.__doc__)
-                print(e.message)
+                print(e)