2020
Conferences
Adrian Chifu; Josiane Mothe; Md Zia Ullah
Fair Exposure of Documents in Information Retrieval: a Community Detection Approach Conference
Joint Conference of the Information Retrieval Communities in Europe, CIRCLE2020 2020.
Abstract | Links | BibTeX | Tags: Document Communities, Document Network, Document Re-ranking, Fair Document Exposure, Information Retrieval, Information Systems
@conference{Chifu2020CIRCLE,
title = {Fair Exposure of Documents in Information Retrieval: a Community Detection Approach},
author = {Adrian Chifu and Josiane Mothe and Md Zia Ullah},
url = {https://www.irit.fr/CIRCLE/wp-content/uploads/2020/06/CIRCLE20_03.pdf},
year = {2020},
date = {2020-07-01},
booktitle = {Joint Conference of the Information Retrieval Communities in Europe},
series = {CIRCLE2020},
abstract = {While (mainly) designed to answer users’ needs, search engines and recommendation systems do not necessarily guarantee the exposure of the data they store and index while it can be essential for information providers. A recent research direction so called “fair” exposure of documents tackles this problem in information retrieval. It has mainly been cast into a re-ranking problem with constraints and optimization functions. This paper presents the first steps toward a new framework for fair document exposure. This framework is based on document linking and document community detection; communities are used to rank the documents to be retrieved according to an information need. In addition to the first step of this new framework, we present its potential through both a toy example and a few illustrative examples from the 2019 TREC Fair Ranking Track data set.},
keywords = {Document Communities, Document Network, Document Re-ranking, Fair Document Exposure, Information Retrieval, Information Systems},
pubstate = {published},
tppubtype = {conference}
}
2019
Conferences
Josiane Mothe; Léa Laporte; Adrian-Gabriel Chifu
Predicting Query Difficulty in IR: Impact of Difficulty Definition Conference
2019 11th International Conference on Knowledge and Systems Engineering, KSE2019 IEEE 2019.
Abstract | Links | BibTeX | Tags: Information Retrieval, Query Difficulty Prediction, Query Features
@conference{mothe2019predicting,
  author       = {Josiane Mothe and Léa Laporte and Adrian-Gabriel Chifu},
  title        = {Predicting Query Difficulty in IR: Impact of Difficulty Definition},
  booktitle    = {2019 11th International Conference on Knowledge and Systems Engineering},
  series       = {KSE2019},
  organization = {IEEE},
  pages        = {1--6},
  url          = {https://www.irit.fr/publis/SIG/2019_KSE_MLC.pdf},
  year         = {2019},
  date         = {2019-10-24},
  urldate      = {2019-01-01},
  abstract     = {While it exists information on about any topic on the web, we know from information retrieval (IR) evaluation programs that search systems fail to answer to some queries in an effective manner. System failure is associated to query difficulty in the IR literature. However, there is no clear definition of query difficulty. This paper investigates several ways of defining query difficulty and analyses the impact of these definitions on query difficulty prediction results. Our experiments show that the most stable definition across collections is a threshold-based definition of query difficulty classes.},
  keywords     = {Information Retrieval, Query Difficulty Prediction, Query Features},
  pubstate     = {published},
  tppubtype    = {conference},
}
2018
Journal Articles
Adrian-Gabriel Chifu; Florentina Hristea
Feature selection for spectral clustering: to help or not to help spectral clustering when performing sense discrimination for IR? Journal Article
In: Open Computer Science, vol. 8, no. 1, pp. 218–227, 2018.
Abstract | Links | BibTeX | Tags: Information Retrieval, Query Disambiguation, Spectral Clustering, Word Sense Discrimination
@article{chifu2018feature,
  author    = {Adrian-Gabriel Chifu and Florentina Hristea},
  title     = {Feature selection for spectral clustering: to help or not to help spectral clustering when performing sense discrimination for IR?},
  journal   = {Open Computer Science},
  volume    = {8},
  number    = {1},
  pages     = {218--227},
  publisher = {Sciendo},
  url       = {https://www.degruyter.com/view/journals/comp/8/1/article-p218.xml},
  year      = {2018},
  date      = {2018-12-01},
  urldate   = {2018-12-01},
  abstract  = {Whether or not word sense disambiguation (WSD) can improve information retrieval (IR) results represents a topic that has been intensely debated over the years, with many inconclusive or contradictory conclusions. The most rarely used type of WSD for this task is the unsupervised one, although it has been proven to be beneficial at a large scale. Our study builds on existing research and tries to improve the most recent unsupervised method which is based on spectral clustering. It investigates the possible benefits of “helping” spectral clustering through feature selection when it performs sense discrimination for IR. Results obtained so far, involving large data collections, encourage us to point out the importance of feature selection even in the case of this advanced, state of the art clustering technique that is known for performing its own feature weighting. By suggesting an improvement of what we consider the most promising approach to usage of WSD in IR, and by commenting on its possible extensions, we state that WSD still holds a promise for IR and hope to stimulate continuation of this line of research, perhaps at an even more successful level.},
  keywords  = {Information Retrieval, Query Disambiguation, Spectral Clustering, Word Sense Discrimination},
  pubstate  = {published},
  tppubtype = {article},
}
2016
Conferences
Adrian Chifu; Serge Molina; Josiane Mothe
MyBestQuery: A serious game to collect manual query reformulation Conference
Colloque Veille Stratégique Scientifique et Technologique (VSST 2016), Rabat (Morocco), 2016.
Abstract | Links | BibTeX | Tags: Human Annotation, Information Retrieval, Query Reformulation, Serious Game
@conference{ChifuVSST2016,
  author    = {Adrian Chifu and Serge Molina and Josiane Mothe},
  title     = {MyBestQuery: A serious game to collect manual query reformulation},
  booktitle = {Colloque Veille Stratégique Scientifique et Technologique (VSST 2016), Rabat (Morocco)},
  url       = {https://oatao.univ-toulouse.fr/18853/1/2016_VSST_CMM.pdf},
  year      = {2016},
  date      = {2016-10-18},
  urldate   = {2016-10-18},
  abstract  = {This paper presents MyBestQuery, a serious game designed to collect query reformulations from players. Query reformulation is a hot topic in information retrieval and covers many aspects. One of them is query reformulation analysis which is based on users’ session. It can be used to understand user's intent or to measure his satisfaction with regards to the results he obtained when querying the search engine. Automatic query reformulation is another aspect of query reformulation. It automatically expands the initial user’s query in order to improve the quality of the retrieved document set. This mechanism relies on document analysis but could also benefit from manually reformulated query analysis. Web search engines collect millions of search sessions and possible query reformulations. As academics, this information is hardly accessible for us. MyBestQuery is designed as a serious game in order to collect various possible reformulation users suggest. The more long-term objective of this work is to analyse the humanly produced query reformulation in order to both analyse manual query reformulation and compare them with the automatically produced reformulations. Preliminary results are reported in this paper.},
  keywords  = {Human Annotation, Information Retrieval, Query Reformulation, Serious Game},
  pubstate  = {published},
  tppubtype = {conference},
}
2015
Journal Articles
Julie Ayter; Adrian Chifu; Sébastien Déjean; Cecile Desclaux; Josiane Mothe
Statistical analysis to establish the importance of information retrieval parameters Journal Article
In: Journal of Universal Computer Science, vol. 21, no. 13, pp. 1767–1789, 2015.
Abstract | Links | BibTeX | Tags: Information Retrieval, IR System Parameter, Query Clustering, Query Difficulty, Random Forest
@article{ayter2015statistical,
title = {Statistical analysis to establish the importance of information retrieval parameters},
author = {Julie Ayter and Adrian Chifu and Sébastien Déjean and Cecile Desclaux and Josiane Mothe},
url = {https://hal.archives-ouvertes.fr/hal-01592043/document},
year = {2015},
date = {2015-12-01},
urldate = {2015-12-01},
journal = {Journal of Universal Computer Science},
volume = {21},
number = {13},
pages = {1767--1789},
internal-note = {NOTE(review): pages were exported as "pp--1767"; range 1767--1789 taken from J.UCS 21(13) -- verify against the journal issue. Also removed a malformed "key" field (BibTeX sort/label fallback) that duplicated the keywords list with hyphenation artifacts.},
abstract = {Search engines are based on models to index documents, match queries and documents and rank documents. Research in Information Retrieval (IR) aims at defining these models and their parameters in order to optimize the results. Using benchmark collections, it has been shown that there is not a best system configuration that works for any query, but rather that performance varies from one query to another. It would be interesting if a meta-system could decide which system configuration should process a new query by learning from the context of previous queries. This paper reports a deep analysis considering more than 80,000 search engine configurations applied to 100 queries and the corresponding performance. The goal of the analysis is to identify which configuration responds best to a certain type of query. We considered two approaches to define query types: one is post-evaluation, based on query clustering according to the performance measured with Average Precision, while the second approach is pre-evaluation, using query features (including query difficulty predictors) to cluster queries. Globally, we identified two parameters that should be optimized: retrieving model and TrecQueryTags process. One could expect such results as these two parameters are major components of IR process. However our work results in two main conclusions: 1/ based on post-evaluation approach, we found that retrieving model is the most influential parameter for easy queries while TrecQueryTags process is for hard queries; 2/ for pre-evaluation, current query features do not allow to cluster queries to identify differences in the influential parameters.},
keywords = {Information Retrieval, IR System Parameter, Query Clustering, Query Difficulty, Random Forest},
pubstate = {published},
tppubtype = {article}
}
Adrian-Gabriel Chifu; Florentina Hristea; Josiane Mothe; Marius Popescu
Word sense discrimination in information retrieval: A spectral clustering-based approach Journal Article
In: Information Processing & Management, vol. 51, no. 2, pp. 16–31, 2015.
Abstract | Links | BibTeX | Tags: High Precision, Information Retrieval, Spectral Clustering, Word Sense Disambiguation, Word Sense Discrimination
@article{chifu2015word,
title = {Word sense discrimination in information retrieval: A spectral clustering-based approach},
author = {Adrian-Gabriel Chifu and Florentina Hristea and Josiane Mothe and Marius Popescu},
url = {https://hal.archives-ouvertes.fr/hal-01153775/document},
year = {2015},
date = {2015-03-01},
urldate = {2015-01-01},
journal = {Information Processing \& Management},
volume = {51},
number = {2},
pages = {16--31},
publisher = {Elsevier},
abstract = {Word sense ambiguity has been identified as a cause of poor precision in information retrieval (IR) systems. Word sense disambiguation and discrimination methods have been defined to help systems choose which documents should be retrieved in relation to an ambiguous query. However, the only approaches that show a genuine benefit for word sense discrimination or disambiguation in IR are generally supervised ones. In this paper we propose a new unsupervised method that uses word sense discrimination in IR. The method we develop is based on spectral clustering and reorders an initially retrieved document list by boosting documents that are semantically similar to the target query. For several TREC ad hoc collections we show that our method is useful in the case of queries which contain ambiguous terms. We are interested in improving the level of precision after 5, 10 and 30 retrieved documents (P@5, P@10, P@30) respectively. We show that precision can be improved by 8% above current state-of-the-art baselines. We also focus on poor performing queries.},
keywords = {High Precision, Information Retrieval, Spectral Clustering, Word Sense Disambiguation, Word Sense Discrimination},
pubstate = {published},
tppubtype = {article}
}
Conferences
Radu Tudor Ionescu; Adrian-Gabriel Chifu; Josiane Mothe
DeShaTo: Describing the Shape of Cumulative Topic Distributions to Rank Retrieval Systems without Relevance Judgments Conference
International Symposium on String Processing and Information Retrieval, SPIRE2015 Springer 2015.
Abstract | Links | BibTeX | Tags: Document Topic Distribution, Information Retrieval, Kurtosis, LDA, Ranking Retrieval Systems, Skewness, Topic Modeling
@conference{ionescu2015deshato,
  author       = {Radu Tudor Ionescu and Adrian-Gabriel Chifu and Josiane Mothe},
  title        = {DeShaTo: Describing the Shape of Cumulative Topic Distributions to Rank Retrieval Systems without Relevance Judgments},
  booktitle    = {International Symposium on String Processing and Information Retrieval},
  series       = {SPIRE2015},
  organization = {Springer},
  pages        = {75--82},
  url          = {https://oatao.univ-toulouse.fr/15354/1/ionescu_15354.pdf},
  year         = {2015},
  date         = {2015-09-01},
  urldate      = {2015-01-01},
  abstract     = {This paper investigates an approach for estimating the effectiveness of any IR system. The approach is based on the idea that a set of documents retrieved for a specific query is highly relevant if there are only a small number of predominant topics in the retrieved documents. The proposed approach is to determine the topic probability distribution of each document offline, using Latent Dirichlet Allocation. Then, for a retrieved set of documents, a set of probability distribution shape descriptors, namely the skewness and the kurtosis, are used to compute a score based on the shape of the cumulative topic distribution of the respective set of documents. The proposed model is termed DeShaTo, which is short for Describing the Shape of cumulative Topic distributions. In this work, DeShaTo is used to rank retrieval systems without relevance judgments. In most cases, the empirical results are better than the state of the art approach. Compared to other approaches, DeShaTo works independently for each system. Therefore, it remains reliable even when there are less systems to be ranked by relevance.},
  keywords     = {Document Topic Distribution, Information Retrieval, Kurtosis, LDA, Ranking Retrieval Systems, Skewness, Topic Modeling},
  pubstate     = {published},
  tppubtype    = {conference},
}
Adrian Chifu; Léa Laporte; Josiane Mothe
La prédiction efficace de la difficulté des requêtes : une tâche impossible ? Conference
Conférence francophone en Recherche d'Information et Applications (CORIA 2015), Paris, 2015.
Abstract | Links | BibTeX | Tags: Data Mining, Evaluation, Information Retrieval, Query Difficulty Prediction
@conference{ChifuCORIA2015,
title = {La prédiction efficace de la difficulté des requêtes : une tâche impossible ?},
author = {Adrian Chifu and Léa Laporte and Josiane Mothe},
url = {https://oatao.univ-toulouse.fr/15263/1/chifu_15263.pdf},
year = {2015},
date = {2015-03-18},
booktitle = {Conférence francophone en Recherche d'Information et Applications (CORIA 2015), Paris},
abstract = {Résumé :
Les moteurs de recherche d’information (RI) retrouvent des réponses quelle que soit la requête, mais certaines requêtes sont difficiles (le système n’obtient pas de bonne performance en termes de mesure de RI). Pour les requêtes difficiles, des traitements ad-hoc doivent être appliqués. Prédire qu’une requête est difficile est donc crucial et différents prédicteurs ont été proposés. Dans cet article nous étudions la variété de l’information captée par les prédicteurs existants et donc leur non redondance. Par ailleurs, nous montrons que les corrélations entre les prédicteurs et les performance des systèmes donnent peu d’espoir sur la capacité de ces prédicteurs à être réellement efficaces. Enfin, nous étudions la capacité des prédicteurs à prédire les classes de difficulté des requêtes en nous appuyant sur une variété de méthodes exploratoires et d’apprentissage. Nous montrons que malgré les (faibles) corrélations observées avec les mesures de performance, les prédicteurs actuels conduisent à des performances de prédiction variables et sont donc difficilement utilisables dans une application concrète de RI.
Abstract:
Search engines found answers whatever the user query is, but some queries are more difficult than others for the system. For difficult queries, adhoc treatments must be applied. Predicting query difficulty is crucial and different predictors have been proposed. In this paper, we revisit these predictors. First we check the non statistical redundancy of predictors. Then, we show that the correlation between the values of predictors and system performance gives little hope on the ability of these predictors to be effective. Finally, we study the ability of predictors to predict the classes of difficulty by relying on a variety of exploratory and learning methods. We show that despite the (low) correlation with performance measures, current predictors are not robust enough to be used in practical IR applications.},
keywords = {Data Mining, Evaluation, Information Retrieval, Query Difficulty Prediction},
pubstate = {published},
tppubtype = {conference}
}
Les moteurs de recherche d’information (RI) retrouvent des réponses quelle que soit la requête, mais certaines requêtes sont difficiles (le système n’obtient pas de bonne performance en termes de mesure de RI). Pour les requêtes difficiles, des traitements ad-hoc doivent être appliqués. Prédire qu’une requête est difficile est donc crucial et différents prédicteurs ont été proposés. Dans cet article nous étudions la variété de l’information captée par les prédicteurs existants et donc leur non redondance. Par ailleurs, nous montrons que les corrélations entre les prédicteurs et les performance des systèmes donnent peu d’espoir sur la capacité de ces prédicteurs à être réellement efficaces. Enfin, nous étudions la capacité des prédicteurs à prédire les classes de difficulté des requêtes en nous appuyant sur une variété de méthodes exploratoires et d’apprentissage. Nous montrons que malgré les (faibles) corrélations observées avec les mesures de performance, les prédicteurs actuels conduisent à des performances de prédiction variables et sont donc difficilement utilisables dans une application concrète de RI.
Abstract:
Search engines found answers whatever the user query is, but some queries are more difficult than others for the system. For difficult queries, adhoc treatments must be applied. Predicting query difficulty is crucial and different predictors have been proposed. In this paper, we revisit these predictors. First we check the non statistical redundancy of predictors. Then, we show that the correlation between the values of predictors and system performance gives little hope on the ability of these predictors to be effective. Finally, we study the ability of predictors to predict the classes of difficulty by relying on a variety of exploratory and learning methods. We show that despite the (low) correlation with performance measures, current predictors are not robust enough to be used in practical IR applications.
2014
Conferences
Julie Ayter; Cecile Desclaux; Adrian Chifu; Josiane Mothe; Sébastien Déjean
Performance Analysis of Information Retrieval Systems Conference
Spanish Conference on Information Retrieval (CERI2014), Coruna, 2014.
Abstract | Links | BibTeX | Tags: Adaptive Information Retrieval, Classification, Information Retrieval, Optimization, Query Difficulty, Random Forest
@conference{nokey,
title = {Performance Analysis of Information Retrieval Systems},
author = {Julie Ayter and Cecile Desclaux and Adrian Chifu and Josiane Mothe and Sébastien Déjean},
url = {https://hal.archives-ouvertes.fr/hal-01119086/document},
year = {2014},
date = {2014-06-01},
urldate = {2014-06-01},
booktitle = {Spanish Conference on Information Retrieval (CERI2014), Coruna, 2014},
abstract = {It has been shown that there is not a best information retrieval system configuration which would work for any query, but rather that performance can vary from one query to another. It would be interesting if a meta-system could decide which system should process a new query by learning from the context of previously submitted queries. This paper reports a deep analysis considering more than 80,000 search engine configurations applied to 100 queries and the corresponding performance. The goal of the analysis is to identify which search engine configuration responds best to a certain type of query. We considered two approaches to define query types: one is based on query clustering according to the query performance (their difficulty), while the other approach uses various query features (including query difficulty predictors) to cluster queries. We identified two parameters that should be optimized first. An important outcome is that we could not obtain strong conclusive results; considering the large number of systems and methods we used, this result could lead to the conclusion that current query features does not fit the optimizing problem.},
internal-note = {NOTE(review): "nokey" is a CMS placeholder citation key; a scheme-consistent key would be ayter2014performance. Confirm nothing cites "nokey" before renaming. The booktitle also embeds venue and year ("Coruna, 2014") -- verify against the export conventions used elsewhere in this file.},
keywords = {Adaptive Information Retrieval, Classification, Information Retrieval, Optimization, Query Difficulty, Random Forest},
pubstate = {published},
tppubtype = {conference}
}
2013
Conferences
Adrian-Gabriel Chifu
Prédire la Difficulté des Requêtes: la Combinaison de Mesures Statistiques et Sémantiques Conference
COnférence francophone en Recherche d'Information et Applications, CORIA2013 2013.
Abstract | Links | BibTeX | Tags: Combined Predictors, Information Retrieval, Measure Correlation, Query Ambiguity, Query Difficulty, Query Performance Prediction
@conference{chifu2013predire,
title = {Prédire la Difficulté des Requêtes: la Combinaison de Mesures Statistiques et Sémantiques},
author = {Adrian-Gabriel Chifu},
url = {https://hal.archives-ouvertes.fr/hal-01145833/document},
year = {2013},
date = {2013-04-03},
urldate = {2013-01-01},
booktitle = {COnférence francophone en Recherche d'Information et Applications},
pages = {191},
internal-note = {NOTE(review): pages were exported garbled as "pp--191"; kept the start page 191 only -- verify the end page against the CORIA 2013 proceedings.},
series = {CORIA2013},
abstract = {The performance of an Information Retrieval System (IRS) is closely related to the query. The queries that lead to retrieval failure are referenced in the literature as "difficult queries". This study aims at analysing, adapting and combining several difficulty predictors. The evaluation of the prediction is based on the correlation between the predicted difficulty and the IRS performance. As predictors, we have considered an ambiguity predictor, the IDF measure and a score distribution measure. We show that combining the proposed predictors, produce good results. The evaluation framework consists in the TREC7 and TREC8 ad hoc collections.},
keywords = {Combined Predictors, Information Retrieval, Measure Correlation, Query Ambiguity, Query Difficulty, Query Performance Prediction},
pubstate = {published},
tppubtype = {conference}
}