[Uludag-commits] r10165 - trunk/pisi/pisi/search

svn-uludag at uludag.org.tr svn-uludag at uludag.org.tr
4 Kas 2006 Cmt 00:43:02 EET


Author: gurer
Date: Sat Nov  4 00:43:02 2006
New Revision: 10165

Modified:
   trunk/pisi/pisi/search/__init__.py
   trunk/pisi/pisi/search/preprocess.py
   trunk/pisi/pisi/search/tokenize.py
Log:
preprocess fonksiyonu artık doğru çalışıyor.

rebuild-db'de 1 saniyelik falan bir hızlanma var, pek kayda değmedi
o açıdan, asıl hız kazancı için db değişikliği gerekecek :(

pisi search çok daha iyi çalışıyor, ama gene de bazı aksamalar
var, sanırım api.py ya da InvertedIndex içindeki kısımlarda da
bir sorun var.


Modified: trunk/pisi/pisi/search/__init__.py
=================================================================
--- trunk/pisi/pisi/search/__init__.py	(original)
+++ trunk/pisi/pisi/search/__init__.py	Sat Nov  4 00:43:02 2006
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2005, TUBITAK/UEKAE
+# Copyright (C) 2005-2006, TUBITAK/UEKAE
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
@@ -54,7 +54,7 @@
     ctx.invidx[id][lang].remove_doc(docid, terms, repo = repo, txn = txn)
 
 def query_terms(id, lang, terms, repo = None, txn = None):
-    terms = map(lambda x: p.lower(lang, x), terms)
+    terms = p.normalize(lang, terms)
     return ctx.invidx[id][lang].query(terms, repo = repo, txn = txn)
 
 def query(id, lang, str, repo = None, txn = None):

Modified: trunk/pisi/pisi/search/preprocess.py
=================================================================
--- trunk/pisi/pisi/search/preprocess.py	(original)
+++ trunk/pisi/pisi/search/preprocess.py	Sat Nov  4 00:43:02 2006
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2005, TUBITAK/UEKAE
+# Copyright (C) 2005-2006, TUBITAK/UEKAE
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
@@ -11,30 +11,17 @@
 #
 
 import tokenize
+import locale
 
-def lowly_python(str):
-    def lowly_char(c):
-        if c=='I':
-            lowly = 'i'   # because of some fools we can't choose locale in lower
-        else:
-            lowly = c.lower()
-        return c
-
-    r = ""
-    for c in str:
-        r += lowly_char(c)
-    return r
-
-def lower(lang, str):
-    if lang=='tr':
-        return lowly_python(str)
-    else:
-        return str.lower()
+def normalize(lang, terms):
+    if lang == "tr":
+        old_locale = locale.setlocale(locale.LC_CTYPE)
+        locale.setlocale(locale.LC_CTYPE, "tr_TR.UTF-8")
+    terms = map(lambda x: unicode(x).lower(), terms)
+    if lang == "tr":
+        locale.setlocale(locale.LC_CTYPE, old_locale)
+    return terms
 
 def preprocess(lang, str):
     terms = tokenize.tokenize(lang, str)
-
-    # normalize
-    terms = map(lambda x: lower(lang, x), terms)
-
-    return terms
+    return normalize(lang, terms)

Modified: trunk/pisi/pisi/search/tokenize.py
=================================================================
--- trunk/pisi/pisi/search/tokenize.py	(original)
+++ trunk/pisi/pisi/search/tokenize.py	Sat Nov  4 00:43:02 2006
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2005, TUBITAK/UEKAE
+# Copyright (C) 2005-2006, TUBITAK/UEKAE
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
@@ -15,10 +15,11 @@
 def tokenize(lang, str):
     if type(str) != type(unicode()):
         str = unicode(str)
+    sepchars = string.whitespace + string.punctuation
     tokens = []
     token = unicode()
     for x in str:
-        if x in string.whitespace or x in string.punctuation:
+        if x in sepchars:
             if len(token) > 0:
                 tokens.append(token)
                 token = unicode()


Uludag-commits mesaj listesiyle ilgili daha fazla bilgi