[Uludag-commits] r10165 - trunk/pisi/pisi/search
svn-uludag at uludag.org.tr
svn-uludag at uludag.org.tr
4 Kas 2006 Cmt 00:43:02 EET
Author: gurer
Date: Sat Nov 4 00:43:02 2006
New Revision: 10165
Modified:
trunk/pisi/pisi/search/__init__.py
trunk/pisi/pisi/search/preprocess.py
trunk/pisi/pisi/search/tokenize.py
Log:
preprocess fonksiyonu artık doğru çalışıyor.
rebuild-db de 1 saniyelik falan bir hızlanma var, pek kayda değmedi
o açıdan, asıl hız kazancı için db değişikliği gerekecek :(
pisi search çok daha iyi çalışıyor, ama gene de bazı aksamalar
var, sanırım api.py ya da InvertedIndex içindeki kısımlarda da
bir sorun var.
Modified: trunk/pisi/pisi/search/__init__.py
=================================================================
--- trunk/pisi/pisi/search/__init__.py (original)
+++ trunk/pisi/pisi/search/__init__.py Sat Nov 4 00:43:02 2006
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2005, TUBITAK/UEKAE
+# Copyright (C) 2005-2006, TUBITAK/UEKAE
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
@@ -54,7 +54,7 @@
ctx.invidx[id][lang].remove_doc(docid, terms, repo = repo, txn = txn)
def query_terms(id, lang, terms, repo = None, txn = None):
- terms = map(lambda x: p.lower(lang, x), terms)
+ terms = p.normalize(lang, terms)
return ctx.invidx[id][lang].query(terms, repo = repo, txn = txn)
def query(id, lang, str, repo = None, txn = None):
Modified: trunk/pisi/pisi/search/preprocess.py
=================================================================
--- trunk/pisi/pisi/search/preprocess.py (original)
+++ trunk/pisi/pisi/search/preprocess.py Sat Nov 4 00:43:02 2006
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2005, TUBITAK/UEKAE
+# Copyright (C) 2005-2006, TUBITAK/UEKAE
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
@@ -11,30 +11,17 @@
#
import tokenize
+import locale
-def lowly_python(str):
- def lowly_char(c):
- if c=='I':
- lowly = 'i' # because of some fools we can't choose locale in lower
- else:
- lowly = c.lower()
- return c
-
- r = ""
- for c in str:
- r += lowly_char(c)
- return r
-
-def lower(lang, str):
- if lang=='tr':
- return lowly_python(str)
- else:
- return str.lower()
+def normalize(lang, terms):
+ if lang == "tr":
+ old_locale = locale.setlocale(locale.LC_CTYPE)
+ locale.setlocale(locale.LC_CTYPE, "tr_TR.UTF-8")
+ terms = map(lambda x: unicode(x).lower(), terms)
+ if lang == "tr":
+ locale.setlocale(locale.LC_CTYPE, old_locale)
+ return terms
def preprocess(lang, str):
terms = tokenize.tokenize(lang, str)
-
- # normalize
- terms = map(lambda x: lower(lang, x), terms)
-
- return terms
+ return normalize(lang, terms)
Modified: trunk/pisi/pisi/search/tokenize.py
=================================================================
--- trunk/pisi/pisi/search/tokenize.py (original)
+++ trunk/pisi/pisi/search/tokenize.py Sat Nov 4 00:43:02 2006
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2005, TUBITAK/UEKAE
+# Copyright (C) 2005-2006, TUBITAK/UEKAE
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
@@ -15,10 +15,11 @@
def tokenize(lang, str):
if type(str) != type(unicode()):
str = unicode(str)
+ sepchars = string.whitespace + string.punctuation
tokens = []
token = unicode()
for x in str:
- if x in string.whitespace or x in string.punctuation:
+ if x in sepchars:
if len(token) > 0:
tokens.append(token)
token = unicode()
Uludag-commits mesaj listesiyle ilgili
daha fazla bilgi