TextShaper: textshaper/split.py comparison

comparison textshaper/split.py @ 54:1d755747e67a

almost there

author	Jeff Hammel <k0scist@gmail.com>
date	Sun, 17 May 2015 09:11:30 -0700
parents	8d8c1ac0e8e1
children	4e2190495d50

comparison

Legend: equal | deleted | inserted | replaced

-:3691ffa84a3a
+:1d755747e67a
 split paragraphs, sentences, etc
 """
 # imports
 import argparse
-import re
+import csv
 import string
 import sys
 def findall(_string, sub):
 begin = 0
 for index, value in _indices:
 sentence = text[begin:index]
 sentence += value
 sentence.strip()
-begin = index
+begin = index + len(value)
 if sentence:
 sentences.append(sentence)
-import pdb; pdb.set_trace()
+# add the trailing bits, if they exist
+sentence = text[begin:].strip()
+if sentence:
+sentences.append(sentence)
+# shouldn't need to do this
+sentences = [sentence.strip() for sentence in sentences]
+return sentences
 def split_paragraphs(text):
 lines = [line.strip() for line in text.strip().splitlines()]
 lines = [line if line else '\n'
 for line in lines]
 text = ' '.join(lines).strip()
 paragraphs = [' '.join(p) for p in text.split('\n')]
 return paragraphs
+def words(text):
+"""return the alphanumeric words in a sentence"""
+words = text.strip().split()
+return [word for word in words]
 def main(args=sys.argv[1:]):
 """CLI"""
 # parse command line arguments
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
+parser.add_argument('-n', '--number', dest='number',
+action='store_true', default=False,
+help="number the sentences (CSV)")
+parser.add_argument('-o', '--output', dest='output',
+type=argparse.FileType('w'), default=sys.stdout,
+help="file to output to, or stdout by default")
 options = parser.parse_args(args)
 # preprocess text
 text = options.file.read().strip()
 text = ' '.join(text.split())
 # find all sentences
 ends = '.?!'
 sentences = split_sentences(text, ends)
 # display
-for sentence in sentences:
+if options.number:
-print (sentence)
+writer = csv.writer(options.output)
+else:
+for sentence in sentences:
+options.output.write(sentence + '\n')
 if __name__ == '__main__':
 main()

Mercurial > hg > TextShaper

comparison textshaper/split.py @ 54:1d755747e67a