Following the events of September 11, 2001, there has been heightened attention in the United States and elsewhere to the use of multiple government and private databases for the identification of possible perpetrators of future attacks, as well as an unprecedented expansion of federal government data mining activities, many involving databases containing personal information. There have also been claims that prospective data mining could be used to find the {\textquotedblleft}signature{\textquotedblright} of terrorist cells embedded in larger networks. We present an overview of why the public has concerns about such activities and describe some proposals for the search of multiple databases which supposedly do not compromise possible pledges of confidentiality to the individuals whose data are included. We also explore their link to the related literatures on privacy-preserving data mining. In particular, we focus on the matching problem across databases and the concept of {\textquotedblleft}selective revelation{\textquotedblright} and their confidentiality implications.

},
	isbn = {978-0-387-71612-1},
	doi = {10.1007/978-0-387-71613-8_10},
	url = {http://dx.doi.org/10.1007/978-0-387-71613-8_10},
	author = {Fienberg, Stephen E.},
	editor = {Chen, Hsinchun and Reid, Edna and Sinai, Joshua and Silke, Andrew and Ganor, Boaz}
}
@inbook{1876,
	title = {How Large Is the World Wide Web?},
	booktitle = {Web Dynamics},
	year = {2004},
	pages = {23--43},
	publisher = {Springer Berlin Heidelberg},
	organization = {Springer Berlin Heidelberg},
	abstract = {There are many metrics one could consider for estimating the size of the World Wide Web, and in the present chapter we focus on size in terms of the number N of Web pages. Since a database with all the valid URLs on the Web cannot be constructed and maintained, determining N by counting is impossible. For the same reasons, estimating N by directly sampling from the Web is also infeasible. Instead of studying the Web as a whole, one can try to assess the size of the publicly indexable Web, which is the part of the Web that is considered for indexing by the major search engines. Several groups of researchers have invested considerable efforts to develop sound sampling schemes that involve submitting a number of queries to several major search engines. Lawrence and Giles [8] developed a procedure for sampling Web documents by submitting various queries to a number of search engines. We contrast their study with the one performed by Bharat and Broder [2] in November 1997. Although both experiments took place almost in the same period of time, their estimates are significantly different. In this chapter we review how the size of the indexable Web was estimated by three groups of researchers using three different statistical models: Lawrence and Giles [8, 9], Bharat and Broder [2] and Bradlow and Schmittlein [3]. Then we present a statistical framework for the analysis of data sets collected by query-based sampling, utilizing a hierarchical Bayes formulation of the Rasch model for multiple list population estimation developed in [6]. We explain why this approach seems to be in reasonable accord with the real-world constraints and thus allows us to make credible inferences about the size of the Web. We give two different methods that lead to credible estimates of the size of the Web in a reasonable amount of time and are also consistent with the real-world constraints.},
	isbn = {978-3-642-07377-9},
	doi = {10.1007/978-3-662-10874-1_2},
	url = {http://dx.doi.org/10.1007/978-3-662-10874-1_2},
	author = {Dobra, Adrian and Fienberg, Stephen E.}
}
@article{1883,
	title = {A Hybrid High-Order Markov Chain Model for Computer Intrusion Detection},
	volume = {10},
	year = {2001},
	pages = {277--295},
	abstract = {A hybrid model based mostly on a high-order Markov chain and occasionally on a statistical-independence model is proposed for profiling command sequences of a computer user in order to identify a "signature behavior" for that user. Based on the model, an estimation procedure for such a signature behavior driven by maximum likelihood (ML) considerations is devised. The formal ML estimates are numerically intractable, but the ML-optimization problem can be substituted by a linear inverse problem with positivity constraint (LININPOS), for which the EM algorithm can be used as an equation solver to produce an approximate ML-estimate. The intrusion detection system works by comparing a user{\textquoteright}s command sequence to the user{\textquoteright}s and others{\textquoteright} estimated signature behaviors in real time through statistical hypothesis testing. A form of likelihood-ratio test is used to detect if a given sequence of commands is from the proclaimed user, with the alternative hypothesis being a masquerader user. Applying the model to real-life data collected from AT\&T Labs-Research indicates that the new methodology holds some promise for intrusion detection.},
	internal-note = {NOTE(review): required journal field is missing for this @article (volume 10, 2001) -- verify against the original source},
	author = {Ju, W.-H. and Vardi, Yehuda}
}
@article{1846,
	title = {A hybrid Markov chain for the Bayesian analysis of the multinomial probit model},
	journal = {Statistics and Computing},
	volume = {8},
	number = {3},
	year = {1998},
	pages = {229--242},
	publisher = {Kluwer Academic Publishers},
	abstract = {Bayesian inference for the multinomial probit model, using the Gibbs sampler with data augmentation, has been recently considered by some authors. The present paper introduces a modification of the sampling technique, by defining a hybrid Markov chain in which, after each Gibbs sampling cycle, a Metropolis step is carried out along a direction of constant likelihood. Examples with simulated data sets motivate and illustrate the new technique. A proof of the ergodicity of the hybrid Markov chain is also given.},
	keywords = {Bayesian analysis, Gibbs sampling, Metropolis algorithm, Multinomial probit model},
	issn = {0960-3174},
	doi = {10.1023/A:1008905311214},
	url = {http://dx.doi.org/10.1023/A\%3A1008905311214},
	author = {Nobile, Agostino}
}
@article{1859,
	title = {On high level exceedance modeling and tail inference},
	journal = {Journal of Statistical Planning and Inference},
	volume = {45},
	year = {1995},
	pages = {247--280},
	abstract = {This paper discusses a general framework common to some varied known and new results involving measures of threshold exceedance by high values of stationary stochastic sequences. In particular these concern the following. (a) Probabilistic modeling of infrequent but potentially damaging physical events such as storms, high stresses, high pollution episodes, describing both repeated occurrences and associated {\textquoteleft}damage{\textquoteright} magnitudes. (b) Statistical estimation of {\textquoteleft}tail parameters{\textquoteright} of a stationary stochastic sequence {Xj}. This includes a variety of estimation problems and in particular cases such as estimation of expected lengths of clusters of high values (e.g. storm durations), of interest in (a). {\textquoteleft}Very high{\textquoteright} values (leading to Poisson-based limits for exceedance statistics) and {\textquoteleft}high{\textquoteright} values (giving normal limits) are considered and exhibited as special cases within the general framework of central limit results for {\textquoteleft}random additive interval functions{\textquoteright}. The case of array sums of dependent random variables is revisited within this framework, clarifying the role of dependence conditions and providing minimal conditions for characterization of possible limit types. The methods are illustrated by the construction of confidence limits for the mean of an {\textquoteleft}exceedance statistic{\textquoteright} measuring high ozone levels, based on Philadelphia monitoring data.},
	keywords = {Central limit theory, Exceedance modeling, Extreme values, Tail estimation},
	doi = {10.1016/0378-3758(94)00075-1},
	author = {Leadbetter, M. R.}
}