# 主要注意一下词表的中文编码,可以用 Sublime 转换一下。
# 写得不是很好,也不太完善,比较粗略,结课以后如有机会我会完善的。
# —— 2017.10.27
# -*- coding: utf-8 -*-
__author__ = 'Zhao'
import bisect
import operator
import re
# --------------- load the word list into a sorted Python list ---------------
path = '/Users/apple/desktop/'  # folder holding the word list; change it to wherever yours lives


def load_word_list(file_path, encoding='utf-8-sig'):
    """Return the dictionary words stored in ``file_path``, sorted ascending.

    The file is expected to hold one word per line.  Trailing newlines are
    removed and blank lines are skipped, so the result never contains an
    empty string.

    Args:
        file_path: Location of the word-list file.
        encoding: Text encoding of the file.  The default ``utf-8-sig`` reads
            plain UTF-8 and also drops a leading byte-order mark, so the
            result does not depend on the platform's locale encoding.

    Returns:
        A list of words in ascending code-point order, the order the
        binary-search lookup further down relies on.

    Raises:
        OSError: The file cannot be opened.
        UnicodeDecodeError: The file is not valid text in ``encoding``.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(file_path, encoding=encoding) as fp:
        stripped = (line.rstrip('\n') for line in fp)
        words = [word for word in stripped if word]
    words.sort()
    return words


if __name__ == "__main__":
    # `copy` is the name the segmentation code below looks the words up in;
    # 'list.txt' is the word-list file name, change it to match yours.
    copy = load_word_list(path + 'list.txt')
# --------------- end of word-list loading ---------------
# The words in this list are 2 characters long on average, so the matching step (window) is set to 5,
# which is always wide enough to hold at least one whole word.
# The list holds 56064 words in total and only 56 of them are longer than 5 characters,
# so 5 is a reasonable step for this program.
# sum = 0
# num = 0
# for x in copy:
# sum += len(x)
# num += 1
# average = (int)(sum/num)
# print(average, ' ', num);
# max_lenth = 0
# for x in copy:
# if max_lenth < len(x):
# max_lenth = len(x)
#
# print(max_lenth)
# number = 0
# for x in copy:
# if len(x) > 5:
# number += 1
#
# print(number)
# --------------- the statistics above were computed once while preparing the program ---------------
# Marks removed from the paragraph before segmentation.  Each ASCII mark is
# paired with what is presumably its full-width counterpart; those are written
# as \u escapes so they cannot be silently folded back into the ASCII form.
_PUNCTUATION = (
    ',', '\uff0c',   # comma: ASCII and full-width
    '.', '。',        # full stop: ASCII and ideographic
    '——',            # doubled dash (a single dash is left alone)
    '……',            # doubled ellipsis (a single ellipsis is left alone)
    '!', '\uff01',   # exclamation mark: ASCII and full-width
    '?', '\uff1f',   # question mark: ASCII and full-width
    ';', '\uff1b',   # semicolon: ASCII and full-width
    ' ',             # ASCII space
)


def strip_punctuation(text):
    """Return ``text`` with every mark listed in ``_PUNCTUATION`` deleted.

    The marks are deleted, not replaced by a separator, so the characters on
    either side of a removed mark become adjacent.  A later dictionary match
    may therefore span what used to be a punctuation boundary; the fixed-size
    matching window can also cut in the wrong place.  Both are known
    limitations that are deliberately left as they are.

    Args:
        text: The paragraph to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Plain substring replacement is used instead of regular expressions:
    # the marks are literals, and characters such as '?' or '.' are not valid
    # or not literal when used unescaped as a pattern.
    for mark in _PUNCTUATION:
        text = text.replace(mark, '')
    return text


if __name__ == "__main__":
    str_input = strip_punctuation(input("请输入一个段落:\n"))
def segment(text, words, max_len=5):
    """Split ``text`` into tokens by forward maximum matching.

    Starting at the left, the longest prefix of at most ``max_len`` characters
    that occurs in ``words`` is taken as the next token.  When no prefix of
    two or more characters is in the dictionary, the single character at the
    current position becomes the token.  This repeats until the whole text,
    including its final character, has been consumed.

    Args:
        text: The string to segment.
        words: Dictionary words as a list sorted in ascending order; the
            lookup is a binary search, so an unsorted list gives wrong
            results.
        max_len: Longest candidate, in characters, tried at each position.

    Returns:
        The tokens in order; joining them reproduces ``text``.
    """
    tokens = []
    pos = 0
    end = len(text)
    while pos < end:
        # Never try a window that reaches past the end of the text.
        longest = min(max_len, end - pos)
        for size in range(longest, 1, -1):
            candidate = text[pos:pos + size]
            # bisect_left inspects every slot, including index 0.
            slot = bisect.bisect_left(words, candidate)
            if slot < len(words) and words[slot] == candidate:
                break
        else:
            # No multi-character word starts here.  A single character is
            # emitted whether or not it is in the dictionary, so it needs
            # no lookup.
            candidate = text[pos]
        tokens.append(candidate)
        pos += len(candidate)
    return tokens


if __name__ == "__main__":
    # Same output format as before: every token is followed by a slash.
    for token in segment(str_input, copy):
        print(token, end='/')
# 来源:CSDN
# 作者:zjugeek
# 链接:https://blog.csdn.net/zjugeek/article/details/78366048