Mercurial > hg > TextShaper
changeset 54:1d755747e67a
almost there
| author | Jeff Hammel <k0scist@gmail.com> | 
|---|---|
| date | Sun, 17 May 2015 09:11:30 -0700 | 
| parents | 3691ffa84a3a | 
| children | 4e2190495d50 | 
| files | textshaper/split.py | 
| diffstat | 1 files changed, 26 insertions(+), 5 deletions(-) [+] | 
line wrap: on
 line diff
--- a/textshaper/split.py Sun May 17 08:53:11 2015 -0700
+++ b/textshaper/split.py Sun May 17 09:11:30 2015 -0700
@@ -6,7 +6,7 @@
 
 # imports
 import argparse
-import re
+import csv
 import string
 import sys
 
@@ -47,10 +47,17 @@
         sentence = text[begin:index]
         sentence += value
         sentence.strip()
-        begin = index
+        begin = index + len(value)
         if sentence:
             sentences.append(sentence)
-    import pdb; pdb.set_trace()
+    # add the trailing bits, if they exist
+    sentence = text[begin:].strip()
+    if sentence:
+        sentences.append(sentence)
+    # shouldn't need to do this
+    sentences = [sentence.strip() for sentence in sentences]
+    return sentences
+
 
 
 def split_paragraphs(text):
@@ -61,12 +68,23 @@
     paragraphs = [' '.join(p) for p in text.split('\n')]
     return paragraphs
 
+def words(text):
+    """return the alphanumeric words in a sentence"""
+    words = text.strip().split()
+    return [word for word in words]
+
 def main(args=sys.argv[1:]):
     """CLI"""
 
     # parse command line arguments
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
+    parser.add_argument('-n', '--number', dest='number',
+                        action='store_true', default=False,
+                        help="number the sentences (CSV)")
+    parser.add_argument('-o', '--output', dest='output',
+                        type=argparse.FileType('w'), default=sys.stdout,
+                        help="file to output to, or stdout by default")
     options = parser.parse_args(args)
 
     # preprocess text
@@ -79,8 +97,11 @@
     sentences = split_sentences(text, ends)
 
     # display
-    for sentence in sentences:
-        print (sentence)
+    if options.number:
+        writer = csv.writer(options.output)
+    else:
+        for sentence in sentences:
+            options.output.write(sentence + '\n')
 
 if __name__ == '__main__':
     main()
