This forum has been archived. All content is frozen. Please use KDE Discuss instead.

Hundreds of duplicate categories on import

Tags: None
(comma "," separated)
rickb
Registered Member
Posts
1
Karma
0
After lots of research and testing, we have decided to transfer 17 years of Quicken data to kmymoney. The import went fairly well except that for each category there are dozens of duplicates with a value of $0.

I can delete these extras one at a time, but but it will take hours. kmymoney doesn't seem to let you multi-select categories, which is the next best option. What I would really like is an option to list all categories with a 0 sum with the option to remove them.

But since none of those options seem to exist, does anyone have any ideas?

Thanks in advance.
zebulon
Registered Member
Posts
417
Karma
2
OS
You can delete a parent category and all of its subcategories in one movement. Is it possible to isolate your duplicate categories somehow? E.g, move your "good" categories under a temporary new parent? Then remove the parent of the bad categories and restore the original hierarchy for the good categories.

If nothing works, you can try scripting them out of the kmy database using sed/awk/perl.
rbross
Registered Member
Posts
3
Karma
0
I decompressed the kmy file, and what I found was that accounts (categories) and subaccounts do not have summary amounts. So I wrote a quick Python program that creates a dictionary of all used subaccounts and creates a new file that excludes any subaccounts that are not used.

To use it:

    Make a copy of your kmy file
    Rename the extension to .gz
    Decompress
    ./pcats.py myfile newfile
    Compress the newfile using gzip
    Rename the newfile with a kmy extension

Note that there will be output listing what subaccounts are active and inactive and deleted to stdout. So redirect stdout to a file if you want to save it and check it later.

Code: Select all
#!/usr/bin/python -OO
import sys
import xml.etree.cElementTree as ET

subaccount = '<SUBACCOUNT id="'

# Main
def main():

    if len(sys.argv) != 3:
        print "USAGE: pcats.py infile outfile"
        exit()

    file_path = sys.argv[1]
    outfile = sys.argv[2]

    try:
        context = ET.iterparse(file_path, events=("start", "end")) #Probably we could use only the start tag
    except IOError:
        print "Unable to open input file"
        exit()

        # turn it into an iterator
    context = iter(context)
    on_members_tag = False

    # Create a dict that stores all valid accounts
    accounts = dict()
    for event, elem in context:
        tag = elem.tag
        value = elem.text
        if value :
            value = value.encode('utf-8').strip()       
        if event == 'start' :
            if tag == "SPLIT" :
                attribs = elem.attrib
                accounts[attribs['account']] = True

        elem.clear() #Save memory!

    try:
        context = ET.iterparse(file_path, events=("start", "end")) #Probably we could use only the start tag
    except IOError:
        print "Unable to open input file"
        exit()

    # turn it into an iterator
    context = iter(context)
    on_members_tag = False

    # Output all accoaunts and how they are classified
    for event, elem in context:
        tag = elem.tag
        value = elem.text
        if value :
            value = value.encode('utf-8').strip()       
        if event == 'start' :
            if tag == "SUBACCOUNT" :
                attribs = elem.attrib
                if not accounts.get(attribs['id'], False):
                    print "Empty: " + attribs['id']
                else:
                    print "Active: " + attribs['id']

        elem.clear() #Save memory!

    # Read file for lines
    try:
        f = open(file_path)
        lines = f.readlines()
        f.close()
    except IOError:
        print "Unable to open input file"
        exit()

    # Open output file
    try:
        f = open(outfile, 'w')
    except IOError:
        print "Unable to create output file"
        exit()

    # Create new file excuding empty categories
    for line in lines:
        i = line.find(subaccount)
        if i != -1:
            i = i + len(subaccount)
            l2 = line[i:]
            i = l2.find('"')
            if i != -1:
                l2 = l2[:i]
                if not accounts.get(l2, False):
                    print "Deleted" + line,
                    continue
        f.write(line)
    f.close()


if __name__ == '__main__':
    main()
rbross
Registered Member
Posts
3
Karma
0
The previous code did not take into account transactions where the subaccounts were used in 0 splits. kmymoney would regenerate many of the subaccount categories for those transactions. This ugly code takes the additional step; I know, I could have used XPATH, but I was in a hurry to go to dinner. Note that this will remove voided transactions as well, since they sum to 0.

Once again,
1) rename your mykmy.mny file to mykmy.gz and decompress it.
2) Decompress it
3) run ./pcats mykmy newkmy
4) compress the new file to a gz file
5) rename to a kmy file

Code: Select all
#!/usr/bin/python -OO
import sys
import xml.etree.cElementTree as ET

subaccount = '<SUBACCOUNT id="'
transaction = '<TRANSACTION '
split = '<SPLIT '
value_zero = 'value="0/1"'
transaction_end = '</TRANSACTION'

# Main
def main():

    if len(sys.argv) != 3:
        print "USAGE: pcats.py infile outfile"
        exit()

    file_path = sys.argv[1]
    outfile = sys.argv[2]

    try:
        context = ET.iterparse(file_path, events=("start", "end")) #Probably we could use only the start tag
    except IOError:
        print "Unable to open input file"
        exit()

        # turn it into an iterator
    context = iter(context)
    on_members_tag = False

    # Create a dict that stores all valid accounts
    accounts = dict()
    for event, elem in context:
        tag = elem.tag
        value = elem.text
        if value :
            value = value.encode('utf-8').strip()       
        if event == 'start' :
            if tag == "SPLIT" :
                attribs = elem.attrib
                accounts[attribs['account']] = True

        elem.clear() #Save memory!

    try:
        context = ET.iterparse(file_path, events=("start", "end")) #Probably we could use only the start tag
    except IOError:
        print "Unable to open input file"
        exit()

    # turn it into an iterator
    context = iter(context)
    on_members_tag = False

    # Output all accounts and how they are classified
    for event, elem in context:
        tag = elem.tag
        value = elem.text
        if value :
            value = value.encode('utf-8').strip()       
        if event == 'start' :
            if tag == "SUBACCOUNT" :
                attribs = elem.attrib
                if not accounts.get(attribs['id'], False):
                    print "Empty: " + attribs['id']
                else:
                    print "Active: " + attribs['id']

        elem.clear() #Save memory!

    # At this point we should read the file as an etree, remove the subaccounts and accounts, and rewrite it

    # Read file for lines
    try:
        f = open(file_path)
        lines = f.readlines()
        f.close()
    except IOError:
        print "Unable to open input file"
        exit()

    # Open output file
    try:
        f = open(outfile, 'w')
    except IOError:
        print "Unable to create output file"
        exit()

    # Create new file excuding empty categories
    trans = False
    trans_lines = []
    for line in lines:
        # Transaction?
        i = line.find(transaction)
        if i != -1:
            trans = True

        # If we are in the middle of a transaction block, special processing.
        if trans:
            # Save line
            trans_lines.append(line)
            # Is this a split line?
            i = line.find(split)
            if i != -1:
                # Is it a zero transaction?
                i = line.find(value_zero)
                if i == -1:
                    # If any splits are not 0, write what we have so far and continue.  This is valid.
                    for sline in trans_lines:
                        f.write(sline)
                    trans = False
                    trans_lines = []
                    # Rest of lines in transaction will be processed normally
                    continue                   
            else:
                # End of transaction?
                i = line.find(transaction_end)
                if i != -1:
                    # If end of transaction, all splits were 0, so don't write
                    # DEBUG
                    for sline in trans_lines:
                        print "Deleting: ", sline
                    trans = False
                    trans_lines = []
                    continue                 
            continue

        i = line.find(subaccount)
        if i != -1:
            i = i + len(subaccount)
            l2 = line[i:]
            i = l2.find('"')
            if i != -1:
                l2 = l2[:i]
                if not accounts.get(l2, False):
                    print "Deleted: " + line,
                    continue
        f.write(line)
    f.close()


if __name__ == '__main__':
    main()
rbross
Registered Member
Posts
3
Karma
0
That previous script was faulty. Here is the correct one and the procedure.

1) Make a copy of your kmy file (for example, finance.kmy)
2) Rename it to have a "gz" extension: finance.gz
3) Extract the file ('gunzip finance.gz"). You should now have a "finance" file.
4) Using your favorite text editor, paste the following script and save it to 'kmymoney-clean.py'
5) Make it executable, 'chmod +x kmymoney-clean.py'
6) Run it: './kmymoney-clean.py finance finance.new'
7) Compress finance.new: 'gzip finance.new'
8) Rename your new file so kmymoney recognizes it: 'mv finance.new.gz finance2.kmy'
9) Open it with kmymoney and verifiy that it looks Ok

Code: Select all
#!/usr/bin/python -OO
import sys
import xml.etree.cElementTree as ET

# Start string
HEAD = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE KMYMONEY-FILE>\n'

# Main
def main():
    if len(sys.argv) != 3:
        print "USAGE: pcats.py infile outfile"
        exit()

    file_path = sys.argv[1]
    outfile = sys.argv[2]

    try:
        tree = ET.parse(file_path)
    except IOError:
        print "Unable to open input file"
        exit()
   
    # Get the root
    root = tree.getroot()

    # Out "account is active" dict
    accounts = {}

    # Mark all used accounts
    for split in root.iter('SPLIT'):
        accounts[split.attrib['account']] = True

    # Now mark all parent accounts
    for accts in root.findall('ACCOUNTS'):
        for acct in accts.findall('ACCOUNT'):
            # Mark all parent accounts as used
            if acct.attrib['parentaccount'] == '':
                accounts[acct.attrib['id']] = True
                print "Marking parent", acct.attrib['id']
            else:
                accounts[acct.attrib['parentaccount']] = True
                print "Marking parent", acct.attrib['parentaccount']

    # Now remove dead accounts
    for accts in root.findall('ACCOUNTS'):
        for acct in accts.findall('ACCOUNT'):
            if accounts.get(acct.attrib['id'], False) == False:
                accts.remove(acct)
                print "Removing:", acct.attrib['id']
                continue;
            for subs in acct.findall('SUBACCOUNTS'):
                for sub in subs.findall('SUBACCOUNT'):
                    if accounts.get(sub.attrib['id'], False) == False:
                        print "Removing:", sub.attrib['id']
                        subs.remove(sub)

    doc = ET.tostring(root)
    f = open(outfile, "w")
    f.write(HEAD)
    f.write(doc)
    f.close()

if __name__ == '__main__':
    main()


Bookmarks



Who is online

Registered users: bartoloni, Bing [Bot], Google [Bot], q.ignora, watchstar