[Django]-How to import data from scanned text into Django models


This is the working version I came up with. Dirty, but effective. Both @akonsu and @Ivan Kharlamov were helpful. Thanks…

import os, re, Levenshtein as lev, codecs
from SimpleQuiz.quiz.models import Choice, Question
from django.core.management.base import BaseCommand, CommandError
import optparse

class Command(BaseCommand):
    """Import OCR'd quiz text into ``Question`` and ``Choice`` rows.

    Every file in the ``questions/`` sub-directory of ``--datapath`` is read
    line by line (decoded as latin-1).  Blank lines and boilerplate lines
    (page headers/footers, see ``is_boilerplate``) are dropped.  Each
    remaining line is classified as one of:

    * the start of a question -- it matches ``QUEST_RE`` ("12. What ..."),
    * the start of a choice -- it matches ``CHOICE_RE`` ("b. Something"),
    * anything else -- a continuation of whichever question or choice was
      started most recently; its text is appended to that object.
    """

    args = '--datapath=/path/to/text/data/'
    can_import_settings = True
    help = 'Imports scanned text into Questions and Choices'
    # NOTE(review): this is the optparse-based option API of older Django
    # releases; it is kept because the rest of the file targets it.
    option_list = BaseCommand.option_list + (
        optparse.make_option('--datapath', action='store', type='string',
                             help='Path to OCRd text files to be parsed.'),
    )
    requires_model_validation = True

    # Parser REs
    # File names ending in "~", "bak", "back" or "backup" are skipped.
    BACKUP_RE = re.compile(r'\~$|bak$|back$|backup$')
    # One to three digits, a dot and a space at the start of the line.
    QUEST_RE = re.compile(r'^[0-9]{1,3}[.][ ]')
    # A single letter a-e, a dot and a space at the start of the line.
    CHOICE_RE = re.compile(r'^[a-e][.][ ]')

    # Known header/footer strings; OCR'd lines within a small edit distance
    # of any of these are treated as boilerplate.
    BOILERPLATE = (
        u'MFT PRACTICE EXAMINATIONS',
        u'BERKELEY TRAINING ASSOCIATES ' + u'\u00A9' + u' 2009',
        u'BERKELEY TRAINING ASSOCIATES',
        u'MARRIAGE AND FAMILY THERAPY',
        u'PRACTICE EXAMINATION 41',
        u'Page 0', u'Page 1', u'Page 2', u'Page 3', u'Page 4',
        u'Page 5', u'Page 6', u'Page 7', u'Page 8', u'Page 9',
    )
    # A line is boilerplate when its Levenshtein distance to an entry of
    # BOILERPLATE is strictly below this value.
    BOILERPLATE_MAX_DISTANCE = 4

    # Letter label of a choice -> its 1-based position.
    CHOICE_ORDER = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

    def handle(self, *args, **options):
        """Read the text files and create the questions and choices.

        Sets ``self.datapath`` (the user-expanded ``--datapath``) and
        ``self.data_lines`` (the retained text lines, in reading order).

        Raises:
            CommandError: if ``--datapath`` was not given or its
                ``questions/`` sub-directory cannot be listed.
        """
        # get the data path
        datapath = options.get('datapath')
        if not datapath:
            raise CommandError("None or invalid path provided: %s" % datapath)
        self.datapath = os.path.expanduser(datapath)
        questions_dir = os.path.join(self.datapath, 'questions/')
        try:
            filenames = os.listdir(questions_dir)
        except OSError as e:
            raise CommandError("None or invalid path provided: %s" % e)

        # generate list of text strings from lines in target files
        self.data_lines = []
        for fn in filenames:
            if self.BACKUP_RE.search(fn):
                self.stderr.write("Skipping backup: %s\n" % (fn))
                continue
            # "with" guarantees the file is closed even if decoding fails.
            with codecs.open(os.path.join(questions_dir, fn), 'r',
                             encoding='latin-1') as text_file:
                for line in text_file:
                    if line.strip() and not self.is_boilerplate(line):
                        self.data_lines.append(line)

        # --------------- Parse the text lines and create Questions/Choices
        cur_quest = None      # most recently created Question, if any
        cur_choice = None     # most recently created Choice, if any
        cur_is_quest = False  # True while continuation lines belong to cur_quest
        for line in self.data_lines:
            if self.is_question(line):
                # Drop the leading "<number>." label; keep the rest as text.
                txt = line.split('.', 1)[1]
                cur_quest = Question.objects.create(text=txt.rstrip() + " ")
                cur_is_quest = True
            elif self.is_choice(line):
                if cur_quest is None:
                    # A choice cannot be attached to anything yet.
                    self.stderr.write(
                        "Skipping choice before first question: %r\n" % line)
                    continue
                label, txt = line.split('.', 1)
                cur_choice = Choice.objects.create(
                    text=txt.rstrip() + " ",
                    order=self.char2dig(label),
                    question=cur_quest)
                cur_is_quest = False
            else:
                # Continuation line: extend whichever object came last.
                target = cur_quest if cur_is_quest else cur_choice
                if target is None:
                    self.stderr.write(
                        "Skipping line before first question: %r\n" % line)
                    continue
                target.text += line.rstrip() + " "
                # create() already wrote the row, so the longer text must be
                # saved explicitly or it is lost.
                target.save()
        self.stdout.write("----- FINISHED -----\n")

    def is_question(self, arg_str):
        """Return True if ``arg_str`` starts with a question label ("12. ")."""
        return self.QUEST_RE.search(arg_str) is not None

    def is_choice(self, arg_str):
        """Return True if ``arg_str`` starts with a choice label ("b. ")."""
        return self.CHOICE_RE.search(arg_str) is not None

    def char2dig(self, x):
        """Map a choice letter 'a'..'e' to 1..5; return None for anything else."""
        return self.CHOICE_ORDER.get(x)

    def is_boilerplate(self, arg_str):
        """Return True if ``arg_str`` is close to a known header/footer string.

        Both strings are compared as UTF-8 bytes, and ``arg_str`` is compared
        as given (no stripping), exactly as the distance was computed before.
        """
        candidate = arg_str.encode('utf-8')  # encode once, not per entry
        return any(
            lev.distance(bp.encode('utf-8'), candidate)
            < self.BOILERPLATE_MAX_DISTANCE
            for bp in self.BOILERPLATE
        )


I then tried to create a standalone script along the lines suggested
by [2] and [3], but was unable to get past this line:

kwargs = {"app_label": model_module.__name__.split('.')[-2]}

which fails with "IndexError: list index out of range".

I had the same list index error. It was caused by the way I imported the models in my script. I used to do it like this:

from models import Table1, Table2

Then I realized the Python script is not part of the application, so I changed the import to:

from myapp.models import Table1, Table2

My Python script is started with the following shell script:

# Add the site directory to Python's module search path so that
# package-qualified imports such as `from myapp.models import ...` resolve.
export PYTHONPATH=/path/to/my/site
# Run the script, forwarding this wrapper's own arguments unchanged.
python myscript.py "$@"

Leave a comment