from urllib import request
from io import TextIOWrapper
from itertools import groupby
import re
import json

from meta_release import SourceMetaRelease, MetaRelease

import pdb


#RELEASE_WORD_MATCHER = r'(^Origin:\s|Label:\s|Suite:\s|Version:\s|Codename:\s|Date:\s|Architecture:\s|Components:\s|Description:\s|MD5Sum:\s|SHA256:\s|Acquire-By-Hash:\s)'
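
# For reference, the top of a typical Release file looks roughly like this
# (an abbreviated, illustrative excerpt; the exact fields vary by release):
#
#   Origin: Ubuntu
#   Label: Ubuntu
#   Suite: hoary
#   Version: 5.04
#   Codename: hoary
#   Architectures: amd64 i386 ...
#   Components: main restricted universe multiverse
#   MD5Sum:
#    <checksum> <filesize> <filepath>
#    ...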


class Release:

    def __init__(self, distro_codename):
        # example url: http://old-releases.ubuntu.com/ubuntu/dists/hoary/Release
        # example distro_codename = 'hoary'
        self.distro_codename = distro_codename
        self.meta_release = self.changelog_release_file()

    def changelog_release_file(self):
        meta_release = SourceMetaRelease().meta_release_parse()
        return meta_release

    def distro_release_file_urls(self):
        meta_release_objects = []

        for d in self.meta_release:
            meta_release_obj = MetaRelease(d)
            dist_and_release_file_url = { meta_release_obj.dist: meta_release_obj.release_file }
            meta_release_objects.append(dist_and_release_file_url)

        return meta_release_objects
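
    # Illustrative shape of the list the method above returns (the URL is an
    # assumed example, following the old-releases pattern noted in __init__):
    #   [ {'hoary': 'http://old-releases.ubuntu.com/ubuntu/dists/hoary/Release'}, ... ]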

    def line_format(self, line):
        """ Use this method for cleaning a line, especially the ones with the md5
        and shasums in them. """
        try:
            data = {}
            split_line_arr = line.split(':')
            split_line_arr_len = len(split_line_arr)

            if split_line_arr_len == 1:
                data = self.checksum_line_clean(split_line_arr)

            elif split_line_arr_len >= 2:
                k = split_line_arr[0].strip().lower()
                v = split_line_arr[1].strip()

                #if k == 'md5sum':
                #    v = []
                #elif k == 'sha256':
                #    v = []

                data[k] = v

            return data
        except Exception as e:
            print(f'Failed to clean {line}')
            print(e.__doc__)
            print(e)
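
    # Example behavior, sketched from the rules above (values illustrative):
    #   line_format('Suite: hoary\n')                      -> {'suite': 'hoary'}
    #   line_format(' <checksum> 36973 main/.../Packages') -> checksum dict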

    def checksum_line_clean(self, checksum_line_arr):
        """
        in: [' 26f7612b4526f7b97b6b2f7abbdd59e5f83f879a0dbdcce799b7b91bc36387de 36973 universe/debian-installer/binary-sparc/Packages\n']
        out: { 'checksum': '26f7612b4526f7b97b6b2f7abbdd59e5f83f879a0dbdcce799b7b91bc36387de', 'filesize': '36973', 'filepath': 'universe/debian-installer/binary-sparc/Packages' }
        """
        data = {}

        l = checksum_line_arr[0]
        # split() with no argument collapses runs of whitespace, so no empty
        # strings need to be filtered out afterwards
        checksum_size_and_filepath = l.strip().split()

        data['checksum'] = checksum_size_and_filepath[0]
        data['filesize'] = checksum_size_and_filepath[1]
        data['filepath'] = checksum_size_and_filepath[2]

        return data

    def validate_parse(self):
        """ Use this method for validating the entire returned dictionary. Make
        sure it has the keys we expect. """
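        # A sketch of what the validation might check (the expected key set is
        # an assumption, mirroring the header fields a typical Release file
        # carries):
        #   expected = {'origin', 'label', 'suite', 'version', 'codename',
        #               'date', 'architectures', 'components', 'description'}
        #   return expected.issubset(parsed.keys())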
        return

    def release_file_parse(self, release_file_url):
        """ Returns the parsed Release file as a list of dicts """
        with request.urlopen(f'{release_file_url}') as response:
            lines = TextIOWrapper(response, encoding='utf-8')
            lines = [ l for l in lines ]

            cleaned_lines = []
            # the sha256 branch below needs to know whether a sha1 section
            # already closed the md5 section
            end_md5_checksums = None

            # enumerate() gives the position each cleaned line will occupy in
            # cleaned_lines, so the indices line up for the slicing below
            for index_count, l in enumerate(lines):
                cleaned_line = self.line_format(l)

                # TODO: FINISH THIS. MAKE A FUNCTION THAT HANDLES THIS STUFF.
                if cleaned_line and 'md5sum' in cleaned_line:
                    # this should get the dict with the md5sum header; each
                    # following line is a dict of checksum, filesize, and
                    # filepath until we reach the next section header
                    md5sum_index = index_count
                    start_md5_checksums = md5sum_index + 1
                # the older distros use sha1
                elif cleaned_line and 'sha1' in cleaned_line:
                    sha1sum_index = index_count
                    start_sha1_checksums = sha1sum_index + 1
                    # list slices exclude the end index, so the header's own
                    # index marks the end of the preceding section
                    end_md5_checksums = sha1sum_index
                # newer distros use sha256
                elif cleaned_line and 'sha256' in cleaned_line:
                    sha256sum_index = index_count
                    # only close the md5 section here if no sha1 section did
                    if end_md5_checksums is None:
                        end_md5_checksums = sha256sum_index
                    end_sha1_checksums = sha256sum_index
                    start_sha256_checksums = sha256sum_index + 1
                # Acquire-By-Hash follows the last checksum section, so it
                # closes the sha256 range
                elif cleaned_line and 'acquire-by-hash' in cleaned_line:
                    aquire_by_hash_index = index_count
                    end_sha256_checksums = aquire_by_hash_index

                cleaned_lines.append(cleaned_line)

            # list slicing extracts each range, since we know the index of
            # every section header
            md5sums = cleaned_lines[start_md5_checksums:end_md5_checksums]
            print(f'index of md5 start: {start_md5_checksums}')
            print(f'index of md5 end: {end_md5_checksums}')
            # releases without a sha256 section raise a NameError here, which
            # the caller's try/except reports
            sha256sums = cleaned_lines[start_sha256_checksums:end_sha256_checksums]
            print(f'index of sha256 start: {start_sha256_checksums}')
            print(f'index of sha256 end: {end_sha256_checksums}')
            #sha1sums = cleaned_lines[start_sha1_checksums:end_sha1_checksums]
            #print(f'index of sha1 start: {start_sha1_checksums}')
            #print(f'index of sha1 end: {end_sha1_checksums}')

            #return cleaned_lines
            #return md5sums
            return sha256sums
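
    # One possible shape for the TODO in release_file_parse: a helper that
    # scans the cleaned lines once and returns slice bounds per checksum
    # section, e.g. cleaned_lines[slice(*bounds['sha256'])]. A sketch only,
    # not wired in yet; the section names are the lowercased header keys
    # that line_format produces.
    def checksum_section_bounds(self, cleaned_lines):
        """ Sketch: map each checksum header ('md5sum', 'sha1', 'sha256') to
        the (start, end) slice bounds of its checksum entries. """
        bounds = {}
        open_section = None
        for i, d in enumerate(cleaned_lines):
            if not d:
                continue
            header = next((k for k in ('md5sum', 'sha1', 'sha256') if k in d), None)
            if header or 'acquire-by-hash' in d:
                if open_section:
                    # a new header (or Acquire-By-Hash) closes the open section
                    bounds[open_section] = (bounds[open_section][0], i)
                open_section = header
                if header:
                    # until something closes it, the section runs to the end
                    bounds[header] = (i + 1, len(cleaned_lines))
        return bounds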


if __name__ == '__main__':
    # testing

    r = Release('focal')
    release_file_urls = r.distro_release_file_urls()

    for meta_release_dict in release_file_urls:
        keys = meta_release_dict.keys()
        for distro in keys:
            url = meta_release_dict[distro]
            try:
                results = r.release_file_parse(url)
                for d in results:
                    print(d)
            except Exception as e:
                print(e.__doc__)
                print(e)